zanachka · pull · May 4, 2025 · May 3, 2025 · May 3, 2025 · May 3, 2025
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+# E501: line too long; W503: line break before binary operator.
+ignore = E501, W503
diff --git a/Makefile b/Makefile
@@ -24,7 +24,7 @@ all: setup develop
 venv: .venv/bin/python
 
 setup: venv
-	$(PIP) install -r requirements-dev.txt
+	$(PIP) install -q -r requirements-dev.txt  # -q hides the "already satisfied" noise without masking pip's exit status
 
 .venv/bin/python:
 	test -d .venv || which python3 && python3 -m venv .venv || virtualenv .venv
@@ -45,6 +45,10 @@ develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
 .PHONY: clean_all
 clean_all: clean_venv
 
+.PHONY: build
+build:  # phony target, so it always runs; invokes Poetry's build command
+	poetry build
+
 # ###########
 # Deploy
 # ###########
@@ -61,3 +65,9 @@ upload:
 .PHONY: bump
 bump:
 	$(EDITOR) readability/__init__.py
+	# Sync the first version in pyproject.toml with the edited __version__, read in the shell because make expands its own shell calls before the editor runs
+	version=$$(grep "__version__" readability/__init__.py | cut -d'"' -f2) && test -n "$$version" && \
+	sed -i '0,/version = ".*"/s//version = "'"$$version"'"/' pyproject.toml && \
+	git commit -m "Bump version to $$version" pyproject.toml readability/__init__.py && \
+	git tag "$$version" && \
+	git push --tags
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,15 +1,21 @@
 [tool.poetry]
 name = "readability-lxml"
-version = "0.8.4"
+version = "0.8.4.1"
 description = "fast html to text parser (article readability tool) with python 3 support"
 authors = ["Yuri Baburov <burchik@gmail.com>"]
 license = "Apache License 2.0"
-readme = "README.rst"
+readme = "README.md"
+packages = [
+    { include = "readability" },
+]
 
 [tool.poetry.dependencies]
 python = ">=3.8.2,<3.14"
 chardet = "^5.2.0"
-cssselect = "~1.2"
+cssselect = [
+    { version = "~1.2", markers = "python_version < '3.9'" },
+    { version = "~1.3", markers = "python_version >= '3.9'" }
+]
 lxml = {extras = ["html-clean"], version = "^5.4.0"}
 lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"}
 

diff --git a/readability/__init__.py b/readability/__init__.py
@@ -1,3 +1,3 @@
-__version__ = "0.8.4"
+__version__ = "0.8.4.1"
 
 from .readability import Document
diff --git a/readability/encoding.py b/readability/encoding.py
@@ -1,9 +1,8 @@
 import re
 try:
-    import cchardet
+    import cchardet as chardet
 except ImportError:
     import chardet
-import sys
 
 
 RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)

diff --git a/readability/htmls.py b/readability/htmls.py
@@ -123,8 +123,8 @@ def shorten_title(doc):
                 if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)):
                     title = p0
                     break
-                elif (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)):
-                    title = p1
+                elif (len(pl.split()) >= 4) or (len(pl) >= 4 and cjk.search(pl)):
+                    title = pl
                     break
         else:
             if ": " in title:
@@ -134,11 +134,12 @@ def shorten_title(doc):
                 else:
                     title = orig.split(": ", 1)[1]
 
-    if cjk.search(title) and not (4 <= len(title) < 100):
-        return orig
+    if cjk.search(title):
+        if not (4 <= len(title) < 100):  # Allow length >= 4, cap at 100
+            return orig
     elif not 15 < len(title) < 150:
         return orig
-    
+
     return title
 
 

diff --git a/readability/readability.py b/readability/readability.py
@@ -42,11 +42,11 @@
     "divToPElementsRe": re.compile(
         r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I
     ),
-    #'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
-    #'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
-    #'trimRe': re.compile(r'^\s+|\s+$/'),
-    #'normalizeRe': re.compile(r'\s{2,}/'),
-    #'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
+    # 'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
+    # 'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
+    # 'trimRe': re.compile(r'^\s+|\s+$/'),
+    # 'normalizeRe': re.compile(r'\s{2,}/'),
+    # 'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
     "videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I),
     # skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
 }

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,2 +1,3 @@
 nose
-twine
+twine
+flake8
diff --git a/tests/test_article_only.py b/tests/test_article_only.py
@@ -149,6 +149,7 @@ def test_utf8_kanji(self):
         sample = load_sample("utf-8-kanji.sample.html")
         doc = Document(sample)
         res = doc.summary()
+        assert 0 < len(res) < 10000
 
     def test_author_present(self):
         sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html")
@@ -180,3 +181,53 @@ def test_keep_images_absent_by_defautl(self):
         doc = Document(sample)
 
         assert "<img" not in doc.summary()
+
+    def test_cjk_summary(self):
+        """Check that summary() keeps CJK paragraphs and drops an unrelated div."""
+        html = """
+        <html>
+            <head>
+                <title>这是标题</title>
+            </head>
+            <body>
+                <div>一些无关紧要的内容</div>
+                <div class="article-content">
+                    <h1>主要文章标题</h1>
+                    <p>这是主要内容的第一段。</p>
+                    <p>これはコンテンツの第2段落です。</p>
+                    <p>이것은 콘텐츠의 세 번째 단락입니다.</p>
+                    <p>This is the fourth paragraph.</p>
+                </div>
+                <div>More irrelevant stuff</div>
+            </body>
+        </html>
+        """
+        doc = Document(html)
+        summary = doc.summary()
+        # The Chinese, Japanese and Korean paragraphs must all survive extraction.
+        self.assertIn("这是主要内容的第一段", summary)
+        self.assertIn("これはコンテンツの第2段落です", summary)
+        self.assertIn("이것은 콘텐츠의 세 번째 단락입니다", summary)
+        # The unrelated div placed before the article must not appear in the summary.
+        self.assertNotIn("一些无关紧要的内容", summary)
+
+    def test_shorten_title_delimiter_bug(self):
+        """short_title() must pick the last delimiter-separated part when that part qualifies.
+
+        Regression test for a suspected typo in shorten_title: 'p1' referenced where 'pl' was meant.
+        """
+        markup = """
+        <html>
+            <head>
+                <title>Short Part | これは長いです</title>
+            </head>
+            <body>
+                <div>Content</div>
+            </body>
+        </html>
+        """
+        document = Document(markup)
+        # While the typo is present, short_title() might fail with
+        # NameError: name 'p1' is not defined.
+        # Once fixed, the segment after the " | " delimiter is returned.
+        self.assertEqual(document.short_title(), "これは長いです")
-Original file line number
+Diff line change
@@ -1,2 +1,3 @@
     nose
-    twine
+    twine
+    flake8