diff --git a/.gitignore b/.gitignore index d8961065..b532e65e 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,5 @@ nosetests.xml .idea .cache /.noseids -/.venv \ No newline at end of file +/.venv +/poetry.lock \ No newline at end of file diff --git a/Makefile b/Makefile index 012e4b78..ba14e4f3 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ PY := .venv/bin/python PIP := .venv/bin/pip PEP8 := .venv/bin/pep8 NOSE := .venv/bin/nosetests -TWINE := twine +TWINE := .venv/bin/twine # ########### # Tests rule! @@ -50,6 +50,7 @@ clean_all: clean_venv # ########### .PHONY: dist dist: + $(PY) -m pip install wheel $(PY) setup.py sdist bdist_wheel $(TWINE) check dist/* @@ -57,6 +58,6 @@ dist: upload: $(TWINE) upload dist/* -.PHONY: version_update -version_update: - $(EDITOR) setup.py +.PHONY: bump +bump: + $(EDITOR) readability/__init__.py diff --git a/README.md b/README.md new file mode 100644 index 00000000..e09a515a --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +[![PyPI version](https://img.shields.io/pypi/v/readability-lxml.svg)](https://pypi.python.org/pypi/readability-lxml) + +# python-readability + +Given an HTML document, extract and clean up the main body text and title. + +This is a Python port of a Ruby port of [arc90's Readability project](https://web.archive.org/web/20130519040221/http://www.readability.com/). + +## Installation + +It's easy using `pip`, just run: + +```bash +$ pip install readability-lxml +``` + +As an alternative, you may also use conda to install, just run: + +```bash +$ conda install -c conda-forge readability-lxml +``` + +## Usage + +```python +>>> import requests +>>> from readability import Document + +>>> response = requests.get('http://example.com') +>>> doc = Document(response.content) +>>> doc.title() +'Example Domain' + +>>> doc.summary() +"""
\n
\n

Example Domain

\n +

This domain is established to be used for illustrative examples in documents. You may +use this\n domain in examples without prior coordination or asking for permission.

+\n

More information...

\n
+\n\n
""" +``` + +## Change Log +- 0.8.4 Better CJK support, thanks @cdhigh +- 0.8.3.1 Support for python 3.8 - 3.13 +- 0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev +- 0.8.2 Added article author(s) (thanks @mattblaha) +- 0.8.1 Fixed processing of non-ascii HTMLs via regexps. +- 0.8 Replaced XHTML output with HTML5 output in summary() call. +- 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces. +- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before). +- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6 +- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4 +- 0.4 Added Videos loading and allowed more images per paragraph +- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords + +## Licensing + +This code is under [the Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0) license. + +## Thanks to + +- Latest [readability.js](https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js) +- Ruby port by starrhorne and iterationlabs +- [Python port](https://github.com/gfxmonk/python-readability) by gfxmonk +- [Decruft effort](https://web.archive.org/web/20110214150709/https://www.minvolai.com/blog/decruft-arc90s-readability-in-python/) to move to lxml +- "BR to P" fix from readability.js which improves quality for smaller texts +- Github users contributions. diff --git a/README.rst b/README.rst deleted file mode 100644 index 9b0a8b71..00000000 --- a/README.rst +++ /dev/null @@ -1,76 +0,0 @@ -.. image:: https://travis-ci.org/buriy/python-readability.svg?branch=master - :target: https://travis-ci.org/buriy/python-readability -.. 
image:: https://img.shields.io/pypi/v/readability-lxml.svg - :target: https://pypi.python.org/pypi/readability-lxml - -python-readability -================== - -Given an HTML document, extract and clean up the main body text and title. - -This is a Python port of a Ruby port of `arc90's Readability -project `__. - -Installation ------------- - -It's easy using ``pip``, just run: - -.. code-block:: bash - - $ pip install readability-lxml - -As an alternative, you may also use conda to install, just run: - -.. code-block:: bash - - $ conda install -c conda-forge readability-lxml - -Usage ------ - -.. code-block:: python - - >>> import requests - >>> from readability import Document - - >>> response = requests.get('http://example.com') - >>> doc = Document(response.content) - >>> doc.title() - 'Example Domain' - - >>> doc.summary() - """
\n
\n

Example Domain

\n -

This domain is established to be used for illustrative examples in documents. You may - use this\n domain in examples without prior coordination or asking for permission.

- \n

More information...

\n
- \n\n
""" - -Change Log ----------- - -- 0.8.2 Added article author(s) (thanks @mattblaha) -- 0.8.1 Fixed processing of non-ascii HTMLs via regexps. -- 0.8 Replaced XHTML output with HTML5 output in summary() call. -- 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces. -- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before). -- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6 -- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4 -- 0.4 Added Videos loading and allowed more images per paragraph -- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords - -Licensing ---------- - -This code is under `the Apache License -2.0 `__ license. - -Thanks to ---------- - -- Latest `readability.js `__ -- Ruby port by starrhorne and iterationlabs -- `Python port `__ by gfxmonk -- `Decruft effort ` to move to lxml -- "BR to P" fix from readability.js which improves quality for smaller texts -- Github users contributions. 
diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..4dad46a5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,19 @@ +[tool.poetry] +name = "readability-lxml" +version = "0.8.4" +description = "fast html to text parser (article readability tool) with python 3 support" +authors = ["Yuri Baburov <burchik@gmail.com>"] +license = "Apache License 2.0" +readme = "README.md" + +[tool.poetry.dependencies] +python = ">=3.8.2,<3.14" +chardet = "^5.2.0" +cssselect = "~1.2" +lxml = {extras = ["html-clean"], version = "^5.4.0"} +lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"} + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/readability/__init__.py b/readability/__init__.py index 18dccaeb..f27111b7 100644 --- a/readability/__init__.py +++ b/readability/__init__.py @@ -1,3 +1,3 @@ -__version__ = "0.8.3" +__version__ = "0.8.4" from .readability import Document diff --git a/readability/cleaners.py b/readability/cleaners.py index 69825c6b..e0b07260 100644 --- a/readability/cleaners.py +++ b/readability/cleaners.py @@ -1,6 +1,9 @@ # strip out a set of nuisance html attributes that can mess up rendering in RSS feeds import re -from lxml.html.clean import Cleaner +try: + from lxml.html.clean import Cleaner +except ImportError: + from lxml_html_clean import Cleaner bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"] single_quoted = "'[^']+'" diff --git a/readability/htmls.py b/readability/htmls.py index 87299f5a..b090aa5f 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -110,29 +110,35 @@ def shorten_title(doc): if e.text_content(): add_match(candidates, e.text_content(), orig) + cjk = re.compile('[\u4e00-\u9fff]+') + if candidates: title = sorted(candidates, key=len)[-1] else: for delimiter in [" | ", " - ", " :: ", " / "]: if delimiter in title: parts = orig.split(delimiter) - if len(parts[0].split()) >= 4: - title = parts[0] + p0 = parts[0] + p1 = parts[-1] + if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)): + title = p0 break - elif len(parts[-1].split()) >= 4: - title = parts[-1] + elif (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)): + title = p1 break else: if ": " in title: - parts = orig.split(": ") - if len(parts[-1].split()) >= 4: - title = parts[-1] + p1 = orig.split(": ")[-1] + if (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)): + title = p1 else: title = orig.split(": ", 1)[1] - if not 15 < len(title) < 150: + if cjk.search(title) and not (4 <= len(title) < 100): return orig - + elif not 15 < len(title) < 150: + return orig + return title diff --git a/requirements-dev.txt b/requirements-dev.txt index 6160e33c..bc876e5a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,8 +1,2 @@ -lxml -lxml_html_clean -pytest -chardet nose -pep8 -coverage -wrapt-timeout-decorator +twine \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index d6e1198b..00000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ --e . diff --git a/setup.py b/setup.py index 1819b41a..a88e8185 100755 --- a/setup.py +++ b/setup.py @@ -37,8 +37,8 @@ def find_version(*file_paths): author_email="burchik@gmail.com", description="fast html to text parser (article readability tool) with python 3 support", test_suite="tests.test_article_only", - long_description=open("README.rst").read(), - long_description_content_type='text/x-rst', + long_description=open("README.md").read(), + long_description_content_type="text/markdown", license="Apache License 2.0", url="http://github.com/buriy/python-readability", packages=["readability"],