Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# This workflow installs the Python dependencies, lints with flake8 and runs the
# test suite with pytest on a matrix of Python versions.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python package

on:
  push:
    branches: [ "master" ]
  pull_request:
    branches: [ "master" ]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      # Let every matrix entry finish so one failing version does not cancel the others.
      fail-fast: false
      matrix:
        # Versions are quoted so that "3.10" is not parsed as the float 3.1.
        # NOTE(review): confirm each listed version is still provided for the
        # image that ubuntu-latest currently resolves to.
        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]

    steps:
    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      # NOTE(review): only requirements.txt is installed here, not
      # requirements-dev.txt; confirm it covers everything the tests import.
      run: |
        python -m pip install --upgrade pip
        python -m pip install flake8 pytest
        if [ -f requirements.txt ]; then python -m pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pytest
11 changes: 6 additions & 5 deletions readability/readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,12 +210,13 @@ def get_clean_html(self):
"""
return clean_attributes(tounicode(self.html, method="html"))

def summary(self, html_partial=False):
def summary(self, html_partial=False, keep_all_images=False):
"""
Given a HTML file, extracts the text of the article.

:param html_partial: return only the div of the document, don't wrap
in html and body tags.
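:param keep_all_images: if True, div elements that contain at least one image are exempt from the removal heuristics applied while sanitizing, so their images stay in the summary.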

Warning: It mutates internal DOM representation of the HTML document,
so it is better to call other API methods before this one.
Expand Down Expand Up @@ -257,7 +258,7 @@ def summary(self, html_partial=False):
article = self.html.find("body")
if article is None:
article = self.html
cleaned_article = self.sanitize(article, candidates)
cleaned_article = self.sanitize(article, candidates, keep_all_images)

article_length = len(cleaned_article or "")
retry_length = self.retry_length
Expand Down Expand Up @@ -502,7 +503,7 @@ def reverse_tags(self, node, *tag_names):
for tag_name in tag_names:
yield from reversed(node.findall(".//%s" % tag_name))

def sanitize(self, node, candidates):
def sanitize(self, node, candidates, keep_all_images=False):
MIN_LEN = self.min_text_length
for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
Expand Down Expand Up @@ -563,8 +564,8 @@ def sanitize(self, node, candidates):
to_remove = False
reason = ""

# if el.tag == 'div' and counts["img"] >= 1:
# continue
if keep_all_images and el.tag == 'div' and counts["img"] >= 1:
continue
if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
reason = "too many images (%s)" % counts["img"]
to_remove = True
Expand Down
2 changes: 2 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
lxml
lxml_html_clean
pytest
chardet
nose
pep8
Expand Down
29 changes: 29 additions & 0 deletions tests/samples/summary-keep-all-images.sample.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<!DOCTYPE html>
<html lang="en">
<head></head>
<body>
<h2>
<span>
H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline
</span>
</h2>
<p>
<spa>
Text Text Text Text Text Text Text Text Text Text
</spa>
</p>
<div>
<span>
<a>
<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAABhGlDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw1AUhU9TpSLVDnYQcchQnSyIFXHUKhShQqgVWnUweekfNDEkKS6OgmvBwZ/FqoOLs64OroIg+APiLjgpukiJ9yWFFjFeeLyP8+45vHcfIDSqTLO6xgFNt81MKinm8iti6BUBhBFBPxIys4xZSUrDt77uqZvqLs6z/Pv+rD61YDEgIBLPMMO0ideJpzZtg/M+cZSVZZX4nHjMpAsSP3Jd8fiNc8llgWdGzWxmjjhKLJY6WOlgVjY14knimKrplC/kPFY5b3HWqjXWuid/YbigLy9xndYwUljAIiSIUFBDBVXYiNOuk2IhQ+dJH/+Q65fIpZCrAkaOeWxAg+z6wf/g92ytYmLCSwonge4Xx/kYAUK7QLPuON/HjtM8AYLPwJXe9m80gOlP0uttLXYERLaBi+u2puwBlzvA4JMhm7IrBWkJxSLwfkbflAcGboHeVW9urXOcPgBZmlX6Bjg4BEZLlL3m8+6ezrn929Oa3w9e03KfJqsuOAAAAAlwSFlzAAAuIwAALiMBeKU/dgAAAAd0SU1FB+kBDA8PKt1W5MYAAAAZdEVYdENvbW1lbnQAQ3JlYXRlZCB3aXRoIEdJTVBXgQ4XAAAAFUlEQVQY02P8x+rFgBswMeAFI1UaAJ65AWFYB2G5AAAAAElFTkSuQmCC"
/>
</a>
</span>
</div>
<p>
<spa>
Text Text Text Text Text Text Text Text Text Text
</spa>
</p>
</body>
</html>
21 changes: 21 additions & 0 deletions tests/test_article_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,24 @@ def test_author_absent(self):
sample = load_sample("si-game.sample.html")
doc = Document(sample)
assert '[no-author]' == doc.author()

def test_keep_images_present(self):
    """The summary still contains an <img> tag when keep_all_images=True."""
    html = load_sample("summary-keep-all-images.sample.html")
    summary = Document(html).summary(keep_all_images=True)
    assert "<img" in summary

def test_keep_images_absent(self):
    """The summary contains no <img> tag when keep_all_images=False is passed."""
    html = load_sample("summary-keep-all-images.sample.html")
    summary = Document(html).summary(keep_all_images=False)
    assert "<img" not in summary

def test_keep_images_absent_by_default(self):
    """The summary contains no <img> tag when keep_all_images is left at its default.

    Guards backward compatibility: calling summary() without the new
    argument must behave like keep_all_images=False.
    """
    sample = load_sample("summary-keep-all-images.sample.html")

    doc = Document(sample)

    assert "<img" not in doc.summary()
Loading