diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 00000000..73784a4a --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,40 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python package + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest diff --git a/readability/readability.py b/readability/readability.py index c86e7d17..286841ce 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -210,12 +210,13 @@ def get_clean_html(self): """ return clean_attributes(tounicode(self.html, method="html")) - def summary(self, html_partial=False): + def summary(self, html_partial=False, keep_all_images=False): """ Given a HTML file, extracts the text of the article. :param html_partial: return only the div of the document, don't wrap in html and body tags. 
+ :param keep_all_images: if True, sanitizing skips its removal checks (such as the too-many-images check) for every div that contains at least one image, so those divs are not dropped by them. Warning: It mutates internal DOM representation of the HTML document, so it is better to call other API methods before this one. @@ -257,7 +258,7 @@ def summary(self, html_partial=False): article = self.html.find("body") if article is None: article = self.html - cleaned_article = self.sanitize(article, candidates) + cleaned_article = self.sanitize(article, candidates, keep_all_images) article_length = len(cleaned_article or "") retry_length = self.retry_length @@ -502,7 +503,7 @@ def reverse_tags(self, node, *tag_names): for tag_name in tag_names: yield from reversed(node.findall(".//%s" % tag_name)) - def sanitize(self, node, candidates): + def sanitize(self, node, candidates, keep_all_images=False): MIN_LEN = self.min_text_length for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"): if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: @@ -563,8 +564,8 @@ def sanitize(self, node, candidates): to_remove = False reason = "" - # if el.tag == 'div' and counts["img"] >= 1: - # continue + if keep_all_images and el.tag == 'div' and counts["img"] >= 1: + continue if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3: reason = "too many images (%s)" % counts["img"] to_remove = True diff --git a/requirements-dev.txt b/requirements-dev.txt index 4731fa9d..6160e33c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,6 @@ lxml +lxml_html_clean +pytest chardet nose pep8 diff --git a/tests/samples/summary-keep-all-images.sample.html b/tests/samples/summary-keep-all-images.sample.html new file mode 100644 index 00000000..127683fc --- /dev/null +++ b/tests/samples/summary-keep-all-images.sample.html @@ -0,0 +1,29 @@ + + + + +

+ + H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline + +

+

+ + Text Text Text Text Text Text Text Text Text Text + +

+
+ + + + + +
+

+ + Text Text Text Text Text Text Text Text Text Text + +

+ + \ No newline at end of file diff --git a/tests/test_article_only.py b/tests/test_article_only.py index c5592cfb..1835d9fe 100644 --- a/tests/test_article_only.py +++ b/tests/test_article_only.py @@ -133,3 +133,24 @@ def test_author_absent(self): sample = load_sample("si-game.sample.html") doc = Document(sample) assert '[no-author]' == doc.author() + + def test_keep_images_present(self): + sample = load_sample("summary-keep-all-images.sample.html") + + doc = Document(sample) + + assert "