diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..b33811f --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +ignore = E501, W503 \ No newline at end of file diff --git a/Makefile b/Makefile index ba14e4f..9caf08a 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,7 @@ all: setup develop venv: .venv/bin/python setup: venv - $(PIP) install -r requirements-dev.txt + $(PIP) install -r requirements-dev.txt | grep -v "already satisfied" || true .venv/bin/python: test -d .venv || which python3 && python3 -m venv .venv || virtualenv .venv @@ -45,6 +45,10 @@ develop: .venv/lib/python*/site-packages/readability-lxml.egg-link .PHONY: clean_all clean_all: clean_venv +.PHONY: build +build: + poetry build + # ########### # Deploy # ########### @@ -61,3 +65,9 @@ upload: .PHONY: bump bump: $(EDITOR) readability/__init__.py + $(eval VERSION := $(shell grep "__version__" readability/__init__.py | cut -d'"' -f2)) + # fix first occurrence of version in pyproject.toml + sed -i '0,/version = ".*"/s//version = "$(VERSION)"/' pyproject.toml + git commit -m "Bump version to $(VERSION)" pyproject.toml readability/__init__.py + git tag $(VERSION) + git push --tags diff --git a/pyproject.toml b/pyproject.toml index 4dad46a..4499285 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,15 +1,21 @@ [tool.poetry] name = "readability-lxml" -version = "0.8.4" +version = "0.8.4.1" description = "fast html to text parser (article readability tool) with python 3 support" authors = ["Yuri Baburov "] license = "Apache License 2.0" -readme = "README.rst" +readme = "README.md" +packages = [ + { include = "readability" }, +] [tool.poetry.dependencies] python = ">=3.8.2,<3.14" chardet = "^5.2.0" -cssselect = "~1.2" +cssselect = [ + { version = "~1.2", markers = "python_version < '3.9'" }, + { version = "~1.3", markers = "python_version >= '3.9'" } +] lxml = {extras = ["html-clean"], version = "^5.4.0"} lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"} diff --git a/readability/__init__.py b/readability/__init__.py index f27111b..b36f021 100644 --- a/readability/__init__.py +++ b/readability/__init__.py @@ -1,3 +1,3 @@ -__version__ = "0.8.4" +__version__ = "0.8.4.1" from .readability import Document diff --git a/readability/encoding.py b/readability/encoding.py index c95cc14..08332df 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -1,9 +1,8 @@ import re try: - import cchardet + import cchardet as chardet except ImportError: import chardet -import sys RE_CHARSET = re.compile(r']', flags=re.I) diff --git a/readability/htmls.py b/readability/htmls.py index b090aa5..d99a9f5 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -123,8 +123,8 @@ def shorten_title(doc): if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)): title = p0 break - elif (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)): - title = p1 + elif (len(pl.split()) >= 4) or (len(pl) >= 4 and cjk.search(pl)): + title = pl break else: if ": " in title: @@ -134,11 +134,12 @@ def shorten_title(doc): else: title = orig.split(": ", 1)[1] - if cjk.search(title) and not (4 <= len(title) < 100): - return orig + if cjk.search(title): + if not (4 <= len(title) < 100): # Allow length >= 4, cap at 100 + return orig elif not 15 < len(title) < 150: return orig - + return title diff --git a/readability/readability.py b/readability/readability.py index 286841c..c573905 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -42,11 +42,11 @@ "divToPElementsRe": re.compile( r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I ), - #'replaceBrsRe': re.compile(r'(]*>[ \n\r\t]*){2,}',re.I), - #'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I), - #'trimRe': re.compile(r'^\s+|\s+$/'), - #'normalizeRe': re.compile(r'\s{2,}/'), - #'killBreaksRe': re.compile(r'((\s| ?)*){1,}/'), + # 'replaceBrsRe': re.compile(r'(]*>[ \n\r\t]*){2,}',re.I), + # 'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I), + # 'trimRe': re.compile(r'^\s+|\s+$/'), + # 'normalizeRe': re.compile(r'\s{2,}/'), + # 'killBreaksRe': re.compile(r'((\s| ?)*){1,}/'), "videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I), # skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, } diff --git a/requirements-dev.txt b/requirements-dev.txt index bc876e5..996bbfc 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,2 +1,3 @@ nose -twine \ No newline at end of file +twine +flake8 \ No newline at end of file diff --git a/tests/test_article_only.py b/tests/test_article_only.py index d6cef52..fe32212 100644 --- a/tests/test_article_only.py +++ b/tests/test_article_only.py @@ -149,6 +149,7 @@ def test_utf8_kanji(self): sample = load_sample("utf-8-kanji.sample.html") doc = Document(sample) res = doc.summary() + assert 0 < len(res) < 10000 def test_author_present(self): sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html") @@ -180,3 +181,53 @@ def test_keep_images_absent_by_defautl(self): doc = Document(sample) assert " + + 这是标题 + + +
一些无关紧要的内容
+
+

主要文章标题

+

这是主要内容的第一段。

+

これはコンテンツの第2段落です。

+

이것은 콘텐츠의 세 번째 단락입니다.

+

This is the fourth paragraph.

+
+
More irrelevant stuff
+ + + """ + doc = Document(html) + summary = doc.summary() + # Check that the main CJK content is present in the summary + self.assertTrue("这是主要内容的第一段" in summary) + self.assertTrue("これはコンテンツの第2段落です" in summary) + self.assertTrue("이것은 콘텐츠의 세 번째 단락입니다" in summary) + # Check that irrelevant content is mostly gone + self.assertFalse("一些无关紧要的内容" in summary) + + def test_shorten_title_delimiter_bug(self): + """Test that shorten_title handles delimiters correctly when the last part is valid. + + This specifically targets a potential bug where 'p1' might be used instead of 'pl'. + """ + html = """ + + + Short Part | これは長いです + + +
Content
+ + + """ + doc = Document(html) + # With the bug, this call might raise NameError: name 'p1' is not defined + # With the fix, it should correctly return the last part. + short_title = doc.short_title() + self.assertEqual(short_title, "これは長いです")