Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[flake8]
ignore = E501, W503
12 changes: 11 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ all: setup develop
venv: .venv/bin/python

setup: venv
$(PIP) install -r requirements-dev.txt
$(PIP) install -r requirements-dev.txt | grep -v "already satisfied" || true

.venv/bin/python:
test -d .venv || which python3 && python3 -m venv .venv || virtualenv .venv
Expand All @@ -45,6 +45,10 @@ develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
.PHONY: clean_all
clean_all: clean_venv

# Package the project by running Poetry's build command.
.PHONY: build
build:
poetry build

# ###########
# Deploy
# ###########
Expand All @@ -61,3 +65,9 @@ upload:
# Bump the release version: edit readability/__init__.py by hand, copy the new
# version into pyproject.toml, then commit both files, tag the commit and push
# the tags.
.PHONY: bump
bump:
	$(EDITOR) readability/__init__.py
# make expands every line of a recipe before it runs the first one, so a
# $(shell ...) / $(eval ...) placed here would read __version__ as it was
# *before* the editor ran. The version is therefore read by the shell, and the
# remaining steps are chained in the same shell so the variable stays in scope.
# `test -n` aborts the chain if no version string could be extracted.
# The `0,/re/` address is a GNU sed extension that limits the substitution to
# the first matching line of pyproject.toml.
	VERSION=$$(grep "__version__" readability/__init__.py | cut -d'"' -f2) && \
	test -n "$$VERSION" && \
	sed -i "0,/version = \".*\"/s//version = \"$$VERSION\"/" pyproject.toml && \
	git commit -m "Bump version to $$VERSION" pyproject.toml readability/__init__.py && \
	git tag "$$VERSION" && \
	git push --tags
12 changes: 9 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
[tool.poetry]
name = "readability-lxml"
version = "0.8.4"
version = "0.8.4.1"
description = "fast html to text parser (article readability tool) with python 3 support"
authors = ["Yuri Baburov <burchik@gmail.com>"]
license = "Apache License 2.0"
readme = "README.rst"
readme = "README.md"
packages = [
{ include = "readability" },
]

[tool.poetry.dependencies]
python = ">=3.8.2,<3.14"
chardet = "^5.2.0"
cssselect = "~1.2"
cssselect = [
{ version = "~1.2", markers = "python_version < '3.9'" },
{ version = "~1.3", markers = "python_version >= '3.9'" }
]
lxml = {extras = ["html-clean"], version = "^5.4.0"}
lxml-html-clean = {markers = "python_version < \"3.11\"", version = "^0.4.2"}

Expand Down
2 changes: 1 addition & 1 deletion readability/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
__version__ = "0.8.4"
__version__ = "0.8.4.1"

from .readability import Document
3 changes: 1 addition & 2 deletions readability/encoding.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import re
try:
import cchardet
import cchardet as chardet
except ImportError:
import chardet
import sys


RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
Expand Down
11 changes: 6 additions & 5 deletions readability/htmls.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,8 @@ def shorten_title(doc):
if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)):
title = p0
break
elif (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)):
title = p1
elif (len(pl.split()) >= 4) or (len(pl) >= 4 and cjk.search(pl)):
title = pl
break
else:
if ": " in title:
Expand All @@ -134,11 +134,12 @@ def shorten_title(doc):
else:
title = orig.split(": ", 1)[1]

if cjk.search(title) and not (4 <= len(title) < 100):
return orig
if cjk.search(title):
if not (4 <= len(title) < 100): # Allow length >= 4, cap at 100
return orig
elif not 15 < len(title) < 150:
return orig

return title


Expand Down
10 changes: 5 additions & 5 deletions readability/readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@
"divToPElementsRe": re.compile(
r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I
),
#'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
#'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
#'trimRe': re.compile(r'^\s+|\s+$/'),
#'normalizeRe': re.compile(r'\s{2,}/'),
#'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
# 'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
# 'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
# 'trimRe': re.compile(r'^\s+|\s+$/'),
# 'normalizeRe': re.compile(r'\s{2,}/'),
# 'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
"videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I),
# skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
}
Expand Down
3 changes: 2 additions & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
nose
twine
twine
flake8
51 changes: 51 additions & 0 deletions tests/test_article_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def test_utf8_kanji(self):
sample = load_sample("utf-8-kanji.sample.html")
doc = Document(sample)
res = doc.summary()
assert 0 < len(res) < 10000

def test_author_present(self):
sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html")
Expand Down Expand Up @@ -180,3 +181,53 @@ def test_keep_images_absent_by_defautl(self):
doc = Document(sample)

assert "<img" not in doc.summary()

def test_cjk_summary(self):
    """Check that summary() keeps CJK article text and drops unrelated text.

    The page has a short unrelated <div>, then an "article-content" <div>
    holding paragraphs in Chinese, Japanese, Korean and English, then another
    unrelated <div>. The three CJK paragraphs must appear in the summary and
    the leading unrelated Chinese <div> must not.
    """
    html = """
<html>
<head>
<title>这是标题</title>
</head>
<body>
<div>一些无关紧要的内容</div>
<div class="article-content">
<h1>主要文章标题</h1>
<p>这是主要内容的第一段。</p>
<p>これはコンテンツの第2段落です。</p>
<p>이것은 콘텐츠의 세 번째 단락입니다.</p>
<p>This is the fourth paragraph.</p>
</div>
<div>More irrelevant stuff</div>
</body>
</html>
"""
    doc = Document(html)
    summary = doc.summary()
    # assertIn / assertNotIn print both the needle and the summary on failure,
    # unlike assertTrue(x in y), which only reports "False is not true".
    # The main CJK content must be present in the summary.
    self.assertIn("这是主要内容的第一段", summary)
    self.assertIn("これはコンテンツの第2段落です", summary)
    self.assertIn("이것은 콘텐츠의 세 번째 단락입니다", summary)
    # The unrelated leading block must have been stripped.
    self.assertNotIn("一些无关紧要的内容", summary)

def test_shorten_title_delimiter_bug(self):
    """Regression test for delimiter handling in short_title().

    The <title> is split by " | " into a short first part and a CJK last
    part. The last part is the one expected back. A misspelled variable
    ('p1' where 'pl' was meant) in the title-shortening code could instead
    surface here as a NameError.
    """
    page = """
<html>
<head>
<title>Short Part | これは長いです</title>
</head>
<body>
<div>Content</div>
</body>
</html>
"""
    expected_title = "これは長いです"
    # short_title() is the call that may raise NameError if the misspelling
    # is present; otherwise it should yield the last title part.
    self.assertEqual(Document(page).short_title(), expected_title)
Loading