From 1b8cf4d1dcf0e12b476f863cd326e9e2035faf11 Mon Sep 17 00:00:00 2001 From: egeakman Date: Sun, 13 Apr 2025 11:16:35 -0400 Subject: [PATCH 1/3] Fix Bluesky --- src/models/europython.py | 4 ++++ tests/test_social_media_extractions.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/src/models/europython.py b/src/models/europython.py index 7f6eae9..0946a79 100644 --- a/src/models/europython.py +++ b/src/models/europython.py @@ -136,6 +136,10 @@ def extract_bluesky_url(text: str) -> str: if text.startswith("www."): text = text[4:] + # Remove @ if present + if text.startswith("@"): + text = text[1:] + for marker in ("bsky.app/profile/", "bsky/"): if marker in text: text = text.split(marker, 1)[1] diff --git a/tests/test_social_media_extractions.py b/tests/test_social_media_extractions.py index 0c9f515..b570f29 100644 --- a/tests/test_social_media_extractions.py +++ b/tests/test_social_media_extractions.py @@ -38,7 +38,9 @@ def test_extract_linkedin_url(input_string: str, result: str) -> None: ("input_string", "result"), [ ("username", "https://bsky.app/profile/username.bsky.social"), + ("@username", "https://bsky.app/profile/username.bsky.social"), ("username.dev", "https://bsky.app/profile/username.dev"), + ("@username.dev", "https://bsky.app/profile/username.dev"), ("username.bsky.social", "https://bsky.app/profile/username.bsky.social"), ("bsky.app/profile/username", "https://bsky.app/profile/username.bsky.social"), ("bsky/username", "https://bsky.app/profile/username.bsky.social"), From 1a1ba146fb04926a1b5848113fa5deebd729a752 Mon Sep 17 00:00:00 2001 From: egeakman Date: Sun, 13 Apr 2025 11:22:15 -0400 Subject: [PATCH 2/3] Expand mastodon support --- src/models/europython.py | 28 +++++++++++++++++--------- tests/test_social_media_extractions.py | 1 + 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/models/europython.py b/src/models/europython.py index 0946a79..371ce7f 100644 --- a/src/models/europython.py +++ b/src/models/europython.py @@ 
-93,16 +93,26 @@ def extract_twitter_url(text: str) -> str: @staticmethod def extract_mastodon_url(text: str) -> str: """ - Extract the Mastodon URL from the answer, handle @username@instance format + Normalize Mastodon handle or URL to the format: https://<instance>/@<username> """ - if not text.startswith(("https://", "http://")) and text.count("@") == 2: - mastodon_url = f"https://{text.split('@')[2]}/@{text.split('@')[1]}" - else: - mastodon_url = ( - f"https://{text.removeprefix('https://').removeprefix('http://')}" - ) + text = text.strip().split("?", 1)[0] + + # Handle @username@instance or username@instance formats + if "@" in text and not text.startswith("http"): + parts = text.split("@") + if len(parts) == 3: # @username@instance + _, username, instance = parts + elif len(parts) == 2: # username@instance + username, instance = parts + else: + raise ValueError("Invalid Mastodon handle format") + return f"https://{instance}/@{username}" + + # Handle full URLs + if text.startswith("http://"): + text = "https://" + text[len("http://") :] - return mastodon_url.split("?")[0] + return text @staticmethod def extract_linkedin_url(text: str) -> str: @@ -126,7 +136,7 @@ def extract_bluesky_url(text: str) -> str: Returns a normalized BlueSky URL in the form https://bsky.app/profile/<username>.bsky.social, or uses the entire domain if it's custom (e.g., .dev). 
""" - text = text.split("?", 1)[0].strip() + text = text.strip().split("?", 1)[0] if text.startswith("https://"): text = text[8:] diff --git a/tests/test_social_media_extractions.py b/tests/test_social_media_extractions.py index b570f29..5c13a50 100644 --- a/tests/test_social_media_extractions.py +++ b/tests/test_social_media_extractions.py @@ -13,6 +13,7 @@ "https://mastodon.social/@username", ), ("@username@mastodon.social", "https://mastodon.social/@username"), + ("username@mastodon.social", "https://mastodon.social/@username"), ], ) def test_extract_mastodon_url(input_string: str, result: str) -> None: From f8f52c571628b97e175cabaa94da9221b43e4771 Mon Sep 17 00:00:00 2001 From: egeakman Date: Sun, 13 Apr 2025 11:29:01 -0400 Subject: [PATCH 3/3] Let's not stop the script for a single invalid input --- src/models/europython.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/models/europython.py b/src/models/europython.py index 371ce7f..ce2c5d2 100644 --- a/src/models/europython.py +++ b/src/models/europython.py @@ -91,7 +91,7 @@ def extract_twitter_url(text: str) -> str: return twitter_url.split("?")[0] @staticmethod - def extract_mastodon_url(text: str) -> str: + def extract_mastodon_url(text: str) -> None | str: """ Normalize Mastodon handle or URL to the format: https://<instance>/@<username> """ @@ -105,7 +105,7 @@ def extract_mastodon_url(text: str) -> str: elif len(parts) == 2: # username@instance username, instance = parts else: - raise ValueError("Invalid Mastodon handle format") + return None return f"https://{instance}/@{username}" # Handle full URLs