From c1574456f5aefc1dc05d7def332c48e3799e214c Mon Sep 17 00:00:00 2001
From: cdhigh <cdhigh@users.noreply.github.com>
Date: Thu, 1 May 2025 10:37:30 -0300
Subject: [PATCH 1/6] shorten_title supports CJK character sets.

---
 readability/htmls.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/readability/htmls.py b/readability/htmls.py
index 87299f5..b090aa5 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -110,29 +110,35 @@ def shorten_title(doc):
             if e.text_content():
                 add_match(candidates, e.text_content(), orig)
 
+    cjk = re.compile('[\u4e00-\u9fff]+')
+
     if candidates:
         title = sorted(candidates, key=len)[-1]
     else:
         for delimiter in [" | ", " - ", " :: ", " / "]:
             if delimiter in title:
                 parts = orig.split(delimiter)
-                if len(parts[0].split()) >= 4:
-                    title = parts[0]
+                p0 = parts[0]
+                pl = parts[-1]
+                if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)):
+                    title = p0
                     break
-                elif len(parts[-1].split()) >= 4:
-                    title = parts[-1]
+                elif (len(pl.split()) >= 4) or (len(pl) >= 4 and cjk.search(pl)):
+                    title = pl
                     break
         else:
             if ": " in title:
-                parts = orig.split(": ")
-                if len(parts[-1].split()) >= 4:
-                    title = parts[-1]
+                p1 = orig.split(": ")[-1]
+                if (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)):
+                    title = p1
                 else:
                     title = orig.split(": ", 1)[1]
 
-    if not 15 < len(title) < 150:
+    # CJK titles pack more meaning per character, so they get a shorter window.
+    min_len, max_len = (4, 100) if cjk.search(title) else (16, 150)
+    if not min_len <= len(title) < max_len:
         return orig
-
+
     return title
 
 

From 16ce81dd89bf25b179dced79070fb933857e5dc6 Mon Sep 17 00:00:00 2001
From: cdhigh <cdhigh@users.noreply.github.com>
Date: Thu, 1 May 2025 10:47:50 -0300
Subject: [PATCH 2/6] Update cleaners.py

---
 readability/cleaners.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/readability/cleaners.py b/readability/cleaners.py
index 69825c6..e0b0726 100644
--- a/readability/cleaners.py
+++ b/readability/cleaners.py
@@ -1,6 +1,9 @@
 # strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
 import re
-from lxml.html.clean import Cleaner
+try:
+    from lxml.html.clean import Cleaner
+except ImportError:
+    from lxml_html_clean import Cleaner
 
 bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"]
 single_quoted = "'[^']+'"

From f02d865bc4afc435cc02224a6915494a33abe629 Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Sat, 3 May 2025 18:39:26 +0700
Subject: [PATCH 3/6] Added nose to requirements-dev so "make test" will work
 again.

---
 requirements-dev.txt | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index 6160e33..9f580cb 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,8 +1 @@
-lxml
-lxml_html_clean
-pytest
-chardet
-nose
-pep8
-coverage
-wrapt-timeout-decorator
+nose
\ No newline at end of file

From 6f1b449962fe577e8d695d3147a9fbc21b9bd333 Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Sat, 3 May 2025 18:49:08 +0700
Subject: [PATCH 4/6] Better CJK support (and fix for lxml-clean), thanks
 @cdhigh

---
 .gitignore              |  3 ++-
 Makefile                |  7 ++++---
 README.rst              |  4 +++-
 pyproject.toml          | 19 +++++++++++++++++++
 readability/__init__.py |  2 +-
 5 files changed, 29 insertions(+), 6 deletions(-)
 create mode 100644 pyproject.toml

diff --git a/.gitignore b/.gitignore
index d896106..b532e65 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,4 +14,5 @@ nosetests.xml
 .idea
 .cache
 /.noseids
-/.venv
\ No newline at end of file
+/.venv
+/poetry.lock
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 012e4b7..f1c8f21 100644
--- a/Makefile
+++ b/Makefile
@@ -50,6 +50,7 @@ clean_all: clean_venv
 # ###########
 .PHONY: dist
 dist:
+	$(PY) -m pip install wheel
 	$(PY) setup.py sdist bdist_wheel
 	$(TWINE) check dist/*
 
@@ -57,6 +58,6 @@ dist:
 upload:
 	$(TWINE) upload dist/*
 
-.PHONY: version_update
-version_update:
-	$(EDITOR) setup.py
+.PHONY: bump
+bump:
+	$(EDITOR) readability/__init__.py
diff --git a/README.rst b/README.rst
index 9b0a8b7..72b4e63 100644
--- a/README.rst
+++ b/README.rst
@@ -48,7 +48,9 @@ Usage
 
 Change Log
 ----------
-
+-  0.8.4 Better CJK support, thanks @cdhigh
+-  0.8.3.1 Support for python 3.8 - 3.13
+-  0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev
 -  0.8.2 Added article author(s) (thanks @mattblaha)
 -  0.8.1 Fixed processing of non-ascii HTMLs via regexps.
 -  0.8 Replaced XHTML output with HTML5 output in summary() call.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..4dad46a
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,19 @@
+[tool.poetry]
+name = "readability-lxml"
+version = "0.8.4"
+description = "fast html to text parser (article readability tool) with python 3 support"
+authors = ["Yuri Baburov <burchik@gmail.com>"]
+license = "Apache License 2.0"
+readme = "README.rst"
+
+[tool.poetry.dependencies]
+python = ">=3.8.2,<3.14"
+chardet = "^5.2.0"
+cssselect = "~1.2"
+lxml = {extras = ["html-clean"], version = "^5.4.0"}
+lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"}
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/readability/__init__.py b/readability/__init__.py
index 18dccae..f27111b 100644
--- a/readability/__init__.py
+++ b/readability/__init__.py
@@ -1,3 +1,3 @@
-__version__ = "0.8.3"
+__version__ = "0.8.4"
 
 from .readability import Document

From 344ba9e7c4839019af1d6aace030a9425eeb06cf Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Sat, 3 May 2025 18:49:08 +0700
Subject: [PATCH 5/6] Better CJK support (and fix for lxml-clean), thanks
 @cdhigh

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index d6e1198..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
--e .

From be72501fec6d4924ca97cdfccfa03eaad57cc249 Mon Sep 17 00:00:00 2001
From: Yuri Baburov <burchik@gmail.com>
Date: Sat, 3 May 2025 19:11:41 +0700
Subject: [PATCH 6/6] Updates for publishing.

---
 Makefile             |  2 +-
 README.md            | 67 +++++++++++++++++++++++++++++++++++++
 README.rst           | 78 --------------------------------------------
 requirements-dev.txt |  3 +-
 setup.py             |  4 +--
 5 files changed, 72 insertions(+), 82 deletions(-)
 create mode 100644 README.md
 delete mode 100644 README.rst

diff --git a/Makefile b/Makefile
index f1c8f21..ba14e4f 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ PY := .venv/bin/python
 PIP := .venv/bin/pip
 PEP8 := .venv/bin/pep8
 NOSE := .venv/bin/nosetests
-TWINE := twine
+TWINE := .venv/bin/twine
 
 # ###########
 # Tests rule!
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e09a515
--- /dev/null
+++ b/README.md
@@ -0,0 +1,67 @@
+[![PyPI version](https://img.shields.io/pypi/v/readability-lxml.svg)](https://pypi.python.org/pypi/readability-lxml)
+
+# python-readability
+
+Given an HTML document, extract and clean up the main body text and title.
+
+This is a Python port of a Ruby port of [arc90's Readability project](https://web.archive.org/web/20130519040221/http://www.readability.com/).
+
+## Installation
+
+It's easy using `pip`, just run:
+
+```bash
+$ pip install readability-lxml
+```
+
+As an alternative, you may also use conda to install, just run:
+
+```bash
+$ conda install -c conda-forge readability-lxml
+```
+
+## Usage
+
+```python
+>>> import requests
+>>> from readability import Document
+
+>>> response = requests.get('http://example.com')
+>>> doc = Document(response.content)
+>>> doc.title()
+'Example Domain'
+
+>>> doc.summary()
+"""<html><body><div><body id="readabilityBody">\n<div>\n    <h1>Example Domain</h1>\n
+<p>This domain is established to be used for illustrative examples in documents. You may
+use this\n    domain in examples without prior coordination or asking for permission.</p>
+\n    <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n</div>
+\n</body>\n</div></body></html>"""
+```
+
+## Change Log
+- 0.8.4 Better CJK support, thanks @cdhigh
+- 0.8.3.1 Support for python 3.8 - 3.13
+- 0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev
+- 0.8.2 Added article author(s) (thanks @mattblaha)
+- 0.8.1 Fixed processing of non-ascii HTMLs via regexps.
+- 0.8 Replaced XHTML output with HTML5 output in summary() call.
+- 0.7.1 Support for Python 3.7. Fixed a slowdown when processing documents with lots of spaces.
+- 0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before).
+- 0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6
+- 0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4
+- 0.4 Added Videos loading and allowed more images per paragraph
+- 0.3 Added Document.encoding, positive\_keywords and negative\_keywords
+
+## Licensing
+
+This code is licensed under [the Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0).
+
+## Thanks to
+
+- Latest [readability.js](https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js)
+- Ruby port by starrhorne and iterationlabs
+- [Python port](https://github.com/gfxmonk/python-readability) by gfxmonk
+- [Decruft effort](https://web.archive.org/web/20110214150709/https://www.minvolai.com/blog/decruft-arc90s-readability-in-python/) to move to lxml
+- "BR to P" fix from readability.js which improves quality for smaller texts
+- Contributions from GitHub users.
diff --git a/README.rst b/README.rst
deleted file mode 100644
index 72b4e63..0000000
--- a/README.rst
+++ /dev/null
@@ -1,78 +0,0 @@
-.. image:: https://travis-ci.org/buriy/python-readability.svg?branch=master
-    :target: https://travis-ci.org/buriy/python-readability
-.. image:: https://img.shields.io/pypi/v/readability-lxml.svg
-    :target: https://pypi.python.org/pypi/readability-lxml
-
-python-readability
-==================
-
-Given an HTML document, extract and clean up the main body text and title.
-
-This is a Python port of a Ruby port of `arc90's Readability
-project <https://web.archive.org/web/20130519040221/http://www.readability.com/>`__.
-
-Installation
-------------
-
-It's easy using ``pip``, just run:
-
-.. code-block:: bash
-
-    $ pip install readability-lxml
-
-As an alternative, you may also use conda to install, just run:
-
-.. code-block:: bash
-
-    $ conda install -c conda-forge readability-lxml 
-
-Usage
------
-
-.. code-block:: python
-
-    >>> import requests
-    >>> from readability import Document
-
-    >>> response = requests.get('http://example.com')
-    >>> doc = Document(response.content)
-    >>> doc.title()
-    'Example Domain'
-
-    >>> doc.summary()
-    """<html><body><div><body id="readabilityBody">\n<div>\n    <h1>Example Domain</h1>\n
-    <p>This domain is established to be used for illustrative examples in documents. You may
-    use this\n    domain in examples without prior coordination or asking for permission.</p>
-    \n    <p><a href="http://www.iana.org/domains/example">More information...</a></p>\n</div>
-    \n</body>\n</div></body></html>"""
-
-Change Log
-----------
--  0.8.4 Better CJK support, thanks @cdhigh
--  0.8.3.1 Support for python 3.8 - 3.13
--  0.8.3 We can now save all images via keep_all_images=True (default is to save 1 main image), thanks @botlabsDev
--  0.8.2 Added article author(s) (thanks @mattblaha)
--  0.8.1 Fixed processing of non-ascii HTMLs via regexps.
--  0.8 Replaced XHTML output with HTML5 output in summary() call.
--  0.7.1 Support for Python 3.7 . Fixed a slowdown when processing documents with lots of spaces.
--  0.7 Improved HTML5 tags handling. Fixed stripping unwanted HTML nodes (only first matching node was removed before).
--  0.6 Finally a release which supports Python versions 2.6, 2.7, 3.3 - 3.6
--  0.5 Preparing a release to support Python versions 2.6, 2.7, 3.3 and 3.4
--  0.4 Added Videos loading and allowed more images per paragraph
--  0.3 Added Document.encoding, positive\_keywords and negative\_keywords
-
-Licensing
----------
-
-This code is under `the Apache License
-2.0 <http://www.apache.org/licenses/LICENSE-2.0>`__ license.
-
-Thanks to
----------
-
--  Latest `readability.js <https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js>`__
--  Ruby port by starrhorne and iterationlabs
--  `Python port <https://github.com/gfxmonk/python-readability>`__ by gfxmonk
--  `Decruft effort <https://web.archive.org/web/20110214150709/https://www.minvolai.com/blog/decruft-arc90s-readability-in-python/>` to move to lxml
--  "BR to P" fix from readability.js which improves quality for smaller texts
--  Github users contributions.
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 9f580cb..bc876e5 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1 +1,2 @@
-nose
\ No newline at end of file
+nose
+twine
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 1819b41..a88e818 100755
--- a/setup.py
+++ b/setup.py
@@ -37,8 +37,8 @@ def find_version(*file_paths):
     author_email="burchik@gmail.com",
     description="fast html to text parser (article readability tool) with python 3 support",
     test_suite="tests.test_article_only",
-    long_description=open("README.rst").read(),
-    long_description_content_type='text/x-rst',
+    long_description=open("README.md").read(),
+    long_description_content_type="text/markdown",
     license="Apache License 2.0",
     url="http://github.com/buriy/python-readability",
     packages=["readability"],