diff --git a/nytdiff.py b/nytdiff.py
index e5fd615735..c2a7c69ecd 100644
--- a/nytdiff.py
+++ b/nytdiff.py
@@ -15,7 +15,7 @@
from pytz import timezone
import requests
import tweepy
-from simplediff import html_diff
+from simplediff import string_diff
from selenium import webdriver
TIMEZONE = 'America/Buenos_Aires'
@@ -183,12 +183,99 @@ def strip_html(self, html_str):
styles=styles,
strip=strip)
+ def html_diff(old, new):
+ """
+ Like simplediff.html_diff(), with a tweak: if a hunk
+ consists of only deleting or only adding at the beginning
+ or end of a word, then it's combined into one hunk.
+
+ Examples:
+ Changes from simplediff:
+ - Alice, Bob and Charlie
+ + Alice, Bob, and Charlie
+ simplediff: Alice, BobBob, Charlie
+ this diff: Alice, Bob, Charlie
+
+ - Alice Bob Charlie's Angels And David
+ + Alice Bob Charlie David
+ simplediff: Alice Bob Charlie's Angels AndCharlie David
+ this diff: Alice Bob Charlie's Angels And David
+
+ Same as simplediff:
+ hunks you wouldn't want simplified:
+ - Alice Bob Charlie
+ + Alice Robert Charlie
+ diff: Alice Bob Robert Charlie
+
+ if the change isn't only at the beginning or end:
+ - Alice Bob Charlie
+ + Alice Blob Charlie
+ diff: Alice Bob Blob Charlie
+
+ - Alice Bobby Charlie
+ + Alice bb Charlie
+ diff: Alice Bobby bb Charlie
+
+ - Alice Zeneca Charlie
+ + Alice AstraZeneca Charlie's
+ diff: Alice Zeneca Charlie AstraZeneca Charlie's
+ """
+ def hunk_to_html(op, words):
+ words = ' '.join(words)
+ if op == '-':
+ return '{}'.format(words)
+ if op == '+':
+ return '{}'.format(words)
+ return words
+
+ hunks = string_diff(old, new)
+ html = []
+ skip_next = False
+ for (prev_op, prev_words), (next_op, next_words) in zip(hunks[:-1], hunks[1:]):
+ if prev_op == '-' and next_op == '+':
+ if len(prev_words) == 1:
+ [old_word] = prev_words
+ first_new_word, last_new_word = next_words[0], next_words[-1]
+ if first_new_word.startswith(old_word):
+ next_words[0] = old_word + '' + first_new_word[len(old_word):]
+ html.append(' '.join(next_words) + '')
+ skip_next = True
+ continue
+ elif last_new_word.endswith(old_word):
+ next_words[-1] = last_new_word[:-len(old_word)] + '' + old_word
+ html.append('' + ' '.join(next_words))
+ skip_next = True
+ continue
+ if len(next_words) == 1:
+ [new_word] = next_words
+ first_old_word, last_old_word = prev_words[0], prev_words[-1]
+ if first_old_word.startswith(new_word):
+ prev_words[0] = new_word + '' + first_old_word[len(new_word):]
+ html.append(' '.join(prev_words) + '')
+ skip_next = True
+ continue
+ elif last_old_word.endswith(new_word):
+ prev_words[-1] = last_old_word[:-len(new_word)] + '' + new_word
+ html.append('' + ' '.join(prev_words))
+ skip_next = True
+ continue
+ if skip_next:
+ skip_next = False
+ continue
+ html.append(hunk_to_html(prev_op, prev_words))
+
+ if not skip_next:
+ html.append(hunk_to_html(*(hunks[-1])))
+
+ return ' '.join(html)
+
def show_diff(self, old, new):
if len(old) == 0 or len(new) == 0:
logging.info('Old or New empty')
return False
new_hash = hashlib.sha224(new.encode('utf8')).hexdigest()
- logging.info(html_diff(old, new))
+ htmldiff = self.html_diff(old, new)
+ logging.info(htmldiff)
html = """
@@ -202,7 +289,7 @@ def show_diff(self, old, new):