From c1574456f5aefc1dc05d7def332c48e3799e214c Mon Sep 17 00:00:00 2001 From: cdhigh Date: Thu, 1 May 2025 10:37:30 -0300 Subject: [PATCH 1/6] shorten_title supports CJK character sets. --- readability/htmls.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/readability/htmls.py b/readability/htmls.py index 87299f5..b090aa5 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -110,29 +110,35 @@ def shorten_title(doc): if e.text_content(): add_match(candidates, e.text_content(), orig) + cjk = re.compile('[\u4e00-\u9fff]+') + if candidates: title = sorted(candidates, key=len)[-1] else: for delimiter in [" | ", " - ", " :: ", " / "]: if delimiter in title: parts = orig.split(delimiter) - if len(parts[0].split()) >= 4: - title = parts[0] + p0 = parts[0] + pl = parts[-1] + if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)): + title = p0 break - elif len(parts[-1].split()) >= 4: - title = parts[-1] + elif (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)): + title = p1 break else: if ": " in title: - parts = orig.split(": ") - if len(parts[-1].split()) >= 4: - title = parts[-1] + p1 = orig.split(": ")[-1] + if (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)): + title = p1 else: title = orig.split(": ", 1)[1] - if not 15 < len(title) < 150: + if cjk.search(title) and not (4 <= len(title) < 100): return orig - + elif not 15 < len(title) < 150: + return orig + return title From 16ce81dd89bf25b179dced79070fb933857e5dc6 Mon Sep 17 00:00:00 2001 From: cdhigh Date: Thu, 1 May 2025 10:47:50 -0300 Subject: [PATCH 2/6] Update cleaners.py --- readability/cleaners.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/readability/cleaners.py b/readability/cleaners.py index 69825c6..e0b0726 100644 --- a/readability/cleaners.py +++ b/readability/cleaners.py @@ -1,6 +1,9 @@ # strip out a set of nuisance html attributes that can mess up rendering in RSS feeds import re -from 
lxml.html.clean import Cleaner +try: + from lxml.html.clean import Cleaner +except ImportError: + from lxml_html_clean import Cleaner bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"] single_quoted = "'[^']+'" From f02d865bc4afc435cc02224a6915494a33abe629 Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Sat, 3 May 2025 18:39:26 +0700 Subject: [PATCH 3/6] Added nose to requirements-dev so "make test" will work again. --- requirements-dev.txt | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 6160e33..9f580cb 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,8 +1 @@ -lxml -lxml_html_clean -pytest -chardet -nose -pep8 -coverage -wrapt-timeout-decorator +nose \ No newline at end of file From 6f1b449962fe577e8d695d3147a9fbc21b9bd333 Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Sat, 3 May 2025 18:49:08 +0700 Subject: [PATCH 4/6] Better CJK support (and fix for lxml-clean), thanks @cdhigh --- .gitignore | 3 ++- Makefile | 7 ++++--- README.rst | 4 +++- pyproject.toml | 19 +++++++++++++++++++ readability/__init__.py | 2 +- 5 files changed, 29 insertions(+), 6 deletions(-) create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore index d896106..b532e65 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,5 @@ nosetests.xml .idea .cache /.noseids -/.venv \ No newline at end of file +/.venv +/poetry.lock \ No newline at end of file diff --git a/Makefile b/Makefile index 012e4b7..f1c8f21 100644 --- a/Makefile +++ b/Makefile @@ -50,6 +50,7 @@ clean_all: clean_venv # ########### .PHONY: dist dist: + $(PY) -m pip install wheel $(PY) setup.py sdist bdist_wheel $(TWINE) check dist/* @@ -57,6 +58,6 @@ dist: upload: $(TWINE) upload dist/* -.PHONY: version_update -version_update: - $(EDITOR) setup.py +.PHONY: bump +bump: + $(EDITOR) readability/__init__.py diff --git a/README.rst b/README.rst index 9b0a8b7..72b4e63 100644 --- a/README.rst +++ 
b/README.rst @@ -48,7 +48,9 @@ Usage Change Log ---------- - +- 0.8.4 Better CJK support, thanks @cdhigh +- 0.8.3.1 Support for python 3.8 - 3.13 +- 0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev - 0.8.2 Added article author(s) (thanks @mattblaha) - 0.8.1 Fixed processing of non-ascii HTMLs via regexps. - 0.8 Replaced XHTML output with HTML5 output in summary() call. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4dad46a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,19 @@ +[tool.poetry] +name = "readability-lxml" +version = "0.8.4" +description = "fast html to text parser (article readability tool) with python 3 support" +authors = ["Yuri Baburov <burchik@gmail.com>"] +license = "Apache License 2.0" +readme = "README.rst" + +[tool.poetry.dependencies] +python = ">=3.8.2,<3.14" +chardet = "^5.2.0" +cssselect = "~1.2" +lxml = {extras = ["html-clean"], version = "^5.4.0"} +lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"} + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/readability/__init__.py b/readability/__init__.py index 18dccae..f27111b 100644 --- a/readability/__init__.py +++ b/readability/__init__.py @@ -1,3 +1,3 @@ -__version__ = "0.8.3" +__version__ = "0.8.4" from .readability import Document From 344ba9e7c4839019af1d6aace030a9425eeb06cf Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Sat, 3 May 2025 18:49:08 +0700 Subject: [PATCH 5/6] Better CJK support (and fix for lxml-clean), thanks @cdhigh --- requirements.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index d6e1198..0000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ --e . 
From be72501fec6d4924ca97cdfccfa03eaad57cc249 Mon Sep 17 00:00:00 2001 From: Yuri Baburov Date: Sat, 3 May 2025 19:11:41 +0700 Subject: [PATCH 6/6] Updates for publishing. --- Makefile | 2 +- README.md | 67 +++++++++++++++++++++++++++++++++++++ README.rst | 78 -------------------------------------------- requirements-dev.txt | 3 +- setup.py | 4 +-- 5 files changed, 72 insertions(+), 82 deletions(-) create mode 100644 README.md delete mode 100644 README.rst diff --git a/Makefile b/Makefile index f1c8f21..ba14e4f 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ PY := .venv/bin/python PIP := .venv/bin/pip PEP8 := .venv/bin/pep8 NOSE := .venv/bin/nosetests -TWINE := twine +TWINE := .venv/bin/twine # ########### # Tests rule! diff --git a/README.md b/README.md new file mode 100644 index 0000000..e09a515 --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +[![PyPI version](https://img.shields.io/pypi/v/readability-lxml.svg)](https://pypi.python.org/pypi/readability-lxml) + +# python-readability + +Given an HTML document, extract and clean up the main body text and title. + +This is a Python port of a Ruby port of [arc90's Readability project](https://web.archive.org/web/20130519040221/http://www.readability.com/). + +## Installation + +It's easy using `pip`, just run: + +```bash +$ pip install readability-lxml +``` + +As an alternative, you may also use conda to install, just run: + +```bash +$ conda install -c conda-forge readability-lxml +``` + +## Usage + +```python +>>> import requests +>>> from readability import Document + +>>> response = requests.get('http://example.com') +>>> doc = Document(response.content) +>>> doc.title() +'Example Domain' + +>>> doc.summary() +"""
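<html><body><div><body id="readabilityBody">\n<div>\n <h1>Example Domain</h1>\n +<p>This domain is established to be used for illustrative examples in documents. You may +use this\n domain in examples without prior coordination or asking for permission.</p> +\n <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n</div> +\n</body>\n</div></body></html>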
""" +``` + +## Change Log +- 0.8.4 Better CJK support, thanks @cdhigh +- 0.8.3.1 Support for python 3.8 - 3.13 +- 0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev +- 0.8.2 Added article author(s) (thanks @mattblaha) +- 0.8.1 Fixed processing of non-ascii HTMLs via regexps. +- 0.8 Replaced XHTML output with HTML5 output in summary() call. +- 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces. +- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before). +- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6 +- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4 +- 0.4 Added Videos loading and allowed more images per paragraph +- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords + +## Licensing + +This code is under [the Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0) license. + +## Thanks to + +- Latest [readability.js](https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js) +- Ruby port by starrhorne and iterationlabs +- [Python port](https://github.com/gfxmonk/python-readability) by gfxmonk +- [Decruft effort](https://web.archive.org/web/20110214150709/https://www.minvolai.com/blog/decruft-arc90s-readability-in-python/) to move to lxml +- "BR to P" fix from readability.js which improves quality for smaller texts +- Github users contributions. diff --git a/README.rst b/README.rst deleted file mode 100644 index 72b4e63..0000000 --- a/README.rst +++ /dev/null @@ -1,78 +0,0 @@ -.. image:: https://travis-ci.org/buriy/python-readability.svg?branch=master - :target: https://travis-ci.org/buriy/python-readability -.. 
image:: https://img.shields.io/pypi/v/readability-lxml.svg - :target: https://pypi.python.org/pypi/readability-lxml - -python-readability -================== - -Given an HTML document, extract and clean up the main body text and title. - -This is a Python port of a Ruby port of `arc90's Readability -project `__. - -Installation ------------- - -It's easy using ``pip``, just run: - -.. code-block:: bash - - $ pip install readability-lxml - -As an alternative, you may also use conda to install, just run: - -.. code-block:: bash - - $ conda install -c conda-forge readability-lxml - -Usage ------ - -.. code-block:: python - - >>> import requests - >>> from readability import Document - - >>> response = requests.get('http://example.com') - >>> doc = Document(response.content) - >>> doc.title() - 'Example Domain' - - >>> doc.summary() - """
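<html><body><div><body id="readabilityBody">\n<div>\n <h1>Example Domain</h1>\n - <p>This domain is established to be used for illustrative examples in documents. You may - use this\n domain in examples without prior coordination or asking for permission.</p> - \n <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n</div> - \n</body>\n</div></body></html>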
""" - -Change Log ----------- -- 0.8.4 Better CJK support, thanks @cdhigh -- 0.8.3.1 Support for python 3.8 - 3.13 -- 0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev -- 0.8.2 Added article author(s) (thanks @mattblaha) -- 0.8.1 Fixed processing of non-ascii HTMLs via regexps. -- 0.8 Replaced XHTML output with HTML5 output in summary() call. -- 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces. -- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before). -- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6 -- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4 -- 0.4 Added Videos loading and allowed more images per paragraph -- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords - -Licensing ---------- - -This code is under `the Apache License -2.0 `__ license. - -Thanks to ---------- - -- Latest `readability.js `__ -- Ruby port by starrhorne and iterationlabs -- `Python port `__ by gfxmonk -- `Decruft effort ` to move to lxml -- "BR to P" fix from readability.js which improves quality for smaller texts -- Github users contributions. 
diff --git a/requirements-dev.txt b/requirements-dev.txt index 9f580cb..bc876e5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1 +1,2 @@ -nose \ No newline at end of file +nose +twine \ No newline at end of file diff --git a/setup.py b/setup.py index 1819b41..a88e818 100755 --- a/setup.py +++ b/setup.py @@ -37,8 +37,8 @@ def find_version(*file_paths): author_email="burchik@gmail.com", description="fast html to text parser (article readability tool) with python 3 support", test_suite="tests.test_article_only", - long_description=open("README.rst").read(), - long_description_content_type='text/x-rst', + long_description=open("README.md").read(), + long_description_content_type="text/markdown", license="Apache License 2.0", url="http://github.com/buriy/python-readability", packages=["readability"],