zanachka · pull · May 3, 2025 · May 1, 2025 · May 1, 2025 · May 3, 2025
diff --git a/.gitignore b/.gitignore
@@ -14,4 +14,5 @@ nosetests.xml
 .idea
 .cache
 /.noseids
-/.venv
+/.venv
+/poetry.lock
diff --git a/Makefile b/Makefile
@@ -4,7 +4,7 @@ PY := .venv/bin/python
 PIP := .venv/bin/pip
 PEP8 := .venv/bin/pep8
 NOSE := .venv/bin/nosetests
-TWINE := twine
+TWINE := .venv/bin/twine
 
 # ###########
 # Tests rule!
@@ -50,13 +50,14 @@ clean_all: clean_venv
 # ###########
 .PHONY: dist
 dist:
+	$(PY) -m pip install wheel
 	$(PY) setup.py sdist bdist_wheel
 	$(TWINE) check dist/*
 
 .PHONY: upload
 upload:
 	$(TWINE) upload dist/*
 
-.PHONY: version_update
-version_update:
-	$(EDITOR) setup.py
+.PHONY: bump
+bump:  ## Open every file that hard-codes the release version (package __version__ and pyproject.toml)
+	$(EDITOR) readability/__init__.py pyproject.toml
diff --git a/README.md b/README.md
@@ -0,0 +1,67 @@
+[![PyPI version](https://img.shields.io/pypi/v/readability-lxml.svg)](https://pypi.python.org/pypi/readability-lxml)
+
+# python-readability
+
+Given an HTML document, extract and clean up the main body text and title.
+
+This is a Python port of a Ruby port of [arc90's Readability project](https://web.archive.org/web/20130519040221/http://www.readability.com/).
+
+## Installation
+
+Installation is easy with `pip`:
+
+```bash
+$ pip install readability-lxml
+```
+
+Alternatively, you can install it with conda:
+
+```bash
+$ conda install -c conda-forge readability-lxml
+```
+
+## Usage
+
+```python
+>>> import requests
+>>> from readability import Document
+
+>>> response = requests.get('http://example.com')
+>>> doc = Document(response.content)
+>>> doc.title()
+'Example Domain'
+
+>>> doc.summary()
+"""<html><body><div><body id="readabilityBody">\n<div>\n    <h1>Example Domain</h1>\n
+<p>This domain is established to be used for illustrative examples in documents. You may
+use this\n    domain in examples without prior coordination or asking for permission.</p>
+\n    <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n</div>
+\n</body>\n</div></body></html>"""
+```
+
+## Change Log
+- 0.8.4 Better CJK support, thanks @cdhigh
+- 0.8.3.1 Support for Python 3.8 - 3.13
+- 0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev
+- 0.8.2 Added article author(s) (thanks @mattblaha)
+- 0.8.1 Fixed processing of non-ascii HTMLs via regexps.
+- 0.8 Replaced XHTML output with HTML5 output in summary() call.
+- 0.7.1 Support for Python 3.7. Fixed a slowdown when processing documents with lots of spaces.
+- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before).
+- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6
+- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4
+- 0.4 Added Videos loading and allowed more images per paragraph
+- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords
+
+## Licensing
+
+This code is licensed under the [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0).
+
+## Thanks to
+
+- Latest [readability.js](https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js)
+- Ruby port by starrhorne and iterationlabs
+- [Python port](https://github.com/gfxmonk/python-readability) by gfxmonk
+- [Decruft effort](https://web.archive.org/web/20110214150709/https://www.minvolai.com/blog/decruft-arc90s-readability-in-python/) to move to lxml
+- "BR to P" fix from readability.js which improves quality for smaller texts
+- Contributions from GitHub users.
diff --git a/README.rst b/README.rst
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,23 @@
+[tool.poetry]
+name = "readability-lxml"
+version = "0.8.4"
+description = "fast html to text parser (article readability tool) with python 3 support"
+authors = ["Yuri Baburov <burchik@gmail.com>"]
+license = "Apache License 2.0"
+# setup.py reads README.md for long_description; keep both build paths on the same file.
+readme = "README.md"
+# The distribution name (readability-lxml) differs from the import package (readability),
+# so the package directory must be listed explicitly for poetry-core to find it.
+packages = [{include = "readability"}]
+
+[tool.poetry.dependencies]
+python = ">=3.8.2,<3.14"
+chardet = "^5.2.0"
+cssselect = "~1.2"
+lxml = {extras = ["html-clean"], version = "^5.4.0"}
+lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"}
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/readability/__init__.py b/readability/__init__.py
@@ -1,3 +1,3 @@
-__version__ = "0.8.3"
+__version__ = "0.8.4"
 
 from .readability import Document
diff --git a/readability/cleaners.py b/readability/cleaners.py
@@ -1,6 +1,9 @@
 # strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
 import re
-from lxml.html.clean import Cleaner
+try:
+    from lxml.html.clean import Cleaner
+except ImportError:
+    from lxml_html_clean import Cleaner
 
 bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"]
 single_quoted = "'[^']+'"

diff --git a/readability/htmls.py b/readability/htmls.py
@@ -110,29 +110,35 @@ def shorten_title(doc):
             if e.text_content():
                 add_match(candidates, e.text_content(), orig)
 
+    cjk = re.compile('[\u4e00-\u9fff]+')  # CJK ideographs: such text is not space-separated, so word counts do not apply
+
     if candidates:
         title = sorted(candidates, key=len)[-1]
     else:
         for delimiter in [" | ", " - ", " :: ", " / "]:
             if delimiter in title:
                 parts = orig.split(delimiter)
-                if len(parts[0].split()) >= 4:
-                    title = parts[0]
+                p0 = parts[0]
+                p_last = parts[-1]  # spelled out: "pl" vs "p1" is too easy to confuse
+                if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)):
+                    title = p0
                     break
-                elif len(parts[-1].split()) >= 4:
-                    title = parts[-1]
+                elif (len(p_last.split()) >= 4) or (len(p_last) >= 4 and cjk.search(p_last)):
+                    title = p_last
                     break
         else:
             if ": " in title:
-                parts = orig.split(": ")
-                if len(parts[-1].split()) >= 4:
-                    title = parts[-1]
+                p1 = orig.split(": ")[-1]
+                if (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)):
+                    title = p1
                 else:
                     title = orig.split(": ", 1)[1]
 
-    if not 15 < len(title) < 150:
+    if cjk.search(title) and not (4 <= len(title) < 100):
         return orig
-
+    elif not cjk.search(title) and not 15 < len(title) < 150:  # the 15..150 rule must not reject short CJK titles accepted above
+        return orig
+
     return title
 
 

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,8 +1,2 @@
-lxml
-lxml_html_clean
-pytest
-chardet
 nose
-pep8
-coverage
-wrapt-timeout-decorator
+twine
diff --git a/requirements.txt b/requirements.txt
diff --git a/setup.py b/setup.py
@@ -37,8 +37,8 @@ def find_version(*file_paths):
     author_email="burchik@gmail.com",
     description="fast html to text parser (article readability tool) with python 3 support",
     test_suite="tests.test_article_only",
-    long_description=open("README.rst").read(),
-    long_description_content_type='text/x-rst',
+    long_description=open("README.md").read(),
+    long_description_content_type="text/markdown",
     license="Apache License 2.0",
     url="http://github.com/buriy/python-readability",
     packages=["readability"],
-Original file line number
+Diff line change
@@ Expand Up / @@ -14,4 +14,5 @@ nosetests.xml @@
     .idea
     .cache
     /.noseids
-    /.venv
+    /.venv
+    /poetry.lock