diff --git a/.gitignore b/.gitignore
index d8961065..b532e65e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,4 +14,5 @@ nosetests.xml
.idea
.cache
/.noseids
-/.venv
\ No newline at end of file
+/.venv
+/poetry.lock
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 012e4b78..ba14e4f3 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ PY := .venv/bin/python
PIP := .venv/bin/pip
PEP8 := .venv/bin/pep8
NOSE := .venv/bin/nosetests
-TWINE := twine
+TWINE := .venv/bin/twine
# ###########
# Tests rule!
@@ -50,6 +50,7 @@ clean_all: clean_venv
# ###########
.PHONY: dist
dist:
+ $(PY) -m pip install wheel
$(PY) setup.py sdist bdist_wheel
$(TWINE) check dist/*
@@ -57,6 +58,6 @@ dist:
upload:
$(TWINE) upload dist/*
-.PHONY: version_update
-version_update:
- $(EDITOR) setup.py
+.PHONY: bump
+bump:
+ $(EDITOR) readability/__init__.py
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..e09a515a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,67 @@
+[![PyPI version](https://img.shields.io/pypi/v/readability-lxml.svg)](https://pypi.python.org/pypi/readability-lxml)
+
+# python-readability
+
+Given an HTML document, extract and clean up the main body text and title.
+
+This is a Python port of a Ruby port of [arc90's Readability project](https://web.archive.org/web/20130519040221/http://www.readability.com/).
+
+## Installation
+
+It's easy using `pip`, just run:
+
+```bash
+$ pip install readability-lxml
+```
+
+As an alternative, you may also use conda to install, just run:
+
+```bash
+$ conda install -c conda-forge readability-lxml
+```
+
+## Usage
+
+```python
+>>> import requests
+>>> from readability import Document
+
+>>> response = requests.get('http://example.com')
+>>> doc = Document(response.content)
+>>> doc.title()
+'Example Domain'
+
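+>>> doc.summary()
+"""<html><body><div><body id="readabilityBody">\n<div>\n    <h1>Example Domain</h1>\n
+<p>This domain is established to be used for illustrative examples in documents. You may
+use this\n    domain in examples without prior coordination or asking for permission.</p>
+\n    <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n</div>
+\n</body>\n</div></body></html>"""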
+```
+
+## Change Log
+- 0.8.4 Better CJK support, thanks @cdhigh
+- 0.8.3.1 Support for python 3.8 - 3.13
+- 0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev
+- 0.8.2 Added article author(s) (thanks @mattblaha)
+- 0.8.1 Fixed processing of non-ascii HTMLs via regexps.
+- 0.8 Replaced XHTML output with HTML5 output in summary() call.
+- 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces.
+- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before).
+- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6
+- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4
+- 0.4 Added Videos loading and allowed more images per paragraph
+- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords
+
+## Licensing
+
+This code is under [the Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0) license.
+
+## Thanks to
+
+- Latest [readability.js](https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js)
+- Ruby port by starrhorne and iterationlabs
+- [Python port](https://github.com/gfxmonk/python-readability) by gfxmonk
+- [Decruft effort](https://web.archive.org/web/20110214150709/https://www.minvolai.com/blog/decruft-arc90s-readability-in-python/) to move to lxml
+- "BR to P" fix from readability.js which improves quality for smaller texts
+- Github users contributions.
diff --git a/README.rst b/README.rst
deleted file mode 100644
index 9b0a8b71..00000000
--- a/README.rst
+++ /dev/null
@@ -1,76 +0,0 @@
-.. image:: https://travis-ci.org/buriy/python-readability.svg?branch=master
- :target: https://travis-ci.org/buriy/python-readability
-.. image:: https://img.shields.io/pypi/v/readability-lxml.svg
- :target: https://pypi.python.org/pypi/readability-lxml
-
-python-readability
-==================
-
-Given an HTML document, extract and clean up the main body text and title.
-
-This is a Python port of a Ruby port of `arc90's Readability
-project `__.
-
-Installation
-------------
-
-It's easy using ``pip``, just run:
-
-.. code-block:: bash
-
- $ pip install readability-lxml
-
-As an alternative, you may also use conda to install, just run:
-
-.. code-block:: bash
-
- $ conda install -c conda-forge readability-lxml
-
-Usage
------
-
-.. code-block:: python
-
- >>> import requests
- >>> from readability import Document
-
- >>> response = requests.get('http://example.com')
- >>> doc = Document(response.content)
- >>> doc.title()
- 'Example Domain'
-
- >>> doc.summary()
- """\n
\n
Example Domain
\n
-
This domain is established to be used for illustrative examples in documents. You may
- use this\n domain in examples without prior coordination or asking for permission.
- \n
More information...
\n
- \n\n
"""
-
-Change Log
-----------
-
-- 0.8.2 Added article author(s) (thanks @mattblaha)
-- 0.8.1 Fixed processing of non-ascii HTMLs via regexps.
-- 0.8 Replaced XHTML output with HTML5 output in summary() call.
-- 0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces.
-- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before).
-- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6
-- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4
-- 0.4 Added Videos loading and allowed more images per paragraph
-- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords
-
-Licensing
----------
-
-This code is under `the Apache License
-2.0 `__ license.
-
-Thanks to
----------
-
-- Latest `readability.js `__
-- Ruby port by starrhorne and iterationlabs
-- `Python port `__ by gfxmonk
-- `Decruft effort ` to move to lxml
-- "BR to P" fix from readability.js which improves quality for smaller texts
-- Github users contributions.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..4dad46a5
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,19 @@
+[tool.poetry]
+name = "readability-lxml"
+version = "0.8.4"
+description = "fast html to text parser (article readability tool) with python 3 support"
+authors = ["Yuri Baburov <burchik@gmail.com>"]
+license = "Apache License 2.0"
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = ">=3.8.2,<3.14"
+chardet = "^5.2.0"
+cssselect = "~1.2"
+lxml = {extras = ["html-clean"], version = "^5.4.0"}
+lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"}
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/readability/__init__.py b/readability/__init__.py
index 18dccaeb..f27111b7 100644
--- a/readability/__init__.py
+++ b/readability/__init__.py
@@ -1,3 +1,3 @@
-__version__ = "0.8.3"
+__version__ = "0.8.4"
from .readability import Document
diff --git a/readability/cleaners.py b/readability/cleaners.py
index 69825c6b..e0b07260 100644
--- a/readability/cleaners.py
+++ b/readability/cleaners.py
@@ -1,6 +1,9 @@
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
import re
-from lxml.html.clean import Cleaner
+try:
+ from lxml.html.clean import Cleaner
+except ImportError:
+ from lxml_html_clean import Cleaner
bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"]
single_quoted = "'[^']+'"
diff --git a/readability/htmls.py b/readability/htmls.py
index 87299f5a..b090aa5f 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -110,29 +110,35 @@ def shorten_title(doc):
if e.text_content():
add_match(candidates, e.text_content(), orig)
+ cjk = re.compile('[\u4e00-\u9fff]+')
+
if candidates:
title = sorted(candidates, key=len)[-1]
else:
for delimiter in [" | ", " - ", " :: ", " / "]:
if delimiter in title:
parts = orig.split(delimiter)
- if len(parts[0].split()) >= 4:
- title = parts[0]
+ p0 = parts[0]
+ pl = parts[-1]
+ if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)):
+ title = p0
break
- elif len(parts[-1].split()) >= 4:
- title = parts[-1]
+ elif (len(pl.split()) >= 4) or (len(pl) >= 4 and cjk.search(pl)):
+ title = pl
break
else:
if ": " in title:
- parts = orig.split(": ")
- if len(parts[-1].split()) >= 4:
- title = parts[-1]
+ p1 = orig.split(": ")[-1]
+ if (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)):
+ title = p1
else:
title = orig.split(": ", 1)[1]
- if not 15 < len(title) < 150:
+ if cjk.search(title) and not (4 <= len(title) < 100):
return orig
-
+ elif not cjk.search(title) and not 15 < len(title) < 150:  # generic bounds apply to non-CJK titles only
+ return orig
+
return title
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 6160e33c..bc876e5a 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,8 +1,2 @@
-lxml
-lxml_html_clean
-pytest
-chardet
nose
-pep8
-coverage
-wrapt-timeout-decorator
+twine
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index d6e1198b..00000000
--- a/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
--e .
diff --git a/setup.py b/setup.py
index 1819b41a..a88e8185 100755
--- a/setup.py
+++ b/setup.py
@@ -37,8 +37,8 @@ def find_version(*file_paths):
author_email="burchik@gmail.com",
description="fast html to text parser (article readability tool) with python 3 support",
test_suite="tests.test_article_only",
- long_description=open("README.rst").read(),
- long_description_content_type='text/x-rst',
+ long_description=open("README.md").read(),
+ long_description_content_type="text/markdown",
license="Apache License 2.0",
url="http://github.com/buriy/python-readability",
packages=["readability"],