Improve link and image parsers

This commit is contained in:
vas3k
2020-02-01 20:02:41 +01:00
parent 0ba301315a
commit 59351f7c88

View File

@@ -114,7 +114,7 @@ def refresh_feed(item):
print(f"Entries found: {len(feed.entries)}")
for entry in feed.entries[:DEFAULT_ENTRIES_LIMIT]:
entry_title = parse_title(entry)
entry_link = entry.get("link")
entry_link = parse_link(entry)
if not entry_title or not entry_link:
print("No entry title or link. Skipped")
continue
@@ -154,7 +154,7 @@ def refresh_feed(item):
article.image = lead_image[:512]
# get real url
real_url, content_type, content_length = resolve_url(entry)
real_url, content_type, content_length = resolve_url(entry_link)
# load and summarize article
if content_length <= MAX_PARSABLE_CONTENT_LENGTH \
@@ -201,8 +201,8 @@ def check_conditions(conditions, entry):
return True
def resolve_url(entry):
url = entry.link
def resolve_url(entry_link):
url = str(entry_link)
content_type = None
content_length = MAX_PARSABLE_CONTENT_LENGTH + 1 # don't parse null content-types
depth = 10
@@ -212,7 +212,7 @@ def resolve_url(entry):
try:
response = requests.head(url, timeout=REQUEST_TIMEOUT, verify=False)
except RequestException:
log.warning(f"Failed to resolve URL: {entry.link}")
log.warning(f"Failed to resolve URL: {url}")
return None, content_type, content_length
if 300 < response.status_code < 400:
@@ -245,11 +245,27 @@ def parse_title(entry):
return re.sub("<[^<]+?>", "", title).strip()
def parse_link(entry):
    """Return the URL of a feed entry, or None when it has no usable one.

    The entry's "link" value is preferred. If it is missing or empty, the
    "links" list is scanned and the first item carrying a non-empty "href"
    is returned.

    Args:
        entry: Mapping-like feed entry supporting ``.get()``.

    Returns:
        The link URL, or None if neither "link" nor any "links" item
        provides one.
    """
    link = entry.get("link")
    if link:
        return link

    # Do not assume the first item has an "href": skip items without one
    # instead of raising KeyError, so a later usable link is still found.
    for candidate in entry.get("links") or ():
        href = candidate.get("href")
        if href:
            return href

    return None
def parse_image(entry):
    """Return an image URL for a feed entry, or None if there is none.

    Lookup order:
      1. the first "media_content" item whose "medium" is "image" and
         which has a non-empty "url";
      2. the entry's "image" value: its "href" when it is a dict,
         otherwise the value itself.
    """
    media_items = entry.get("media_content")
    if media_items:
        image_urls = [
            item["url"]
            for item in entry["media_content"]
            if item.get("medium") == "image" and item.get("url")
        ]
        if image_urls:
            return image_urls[0]

    if not entry.get("image"):
        return None

    image = entry["image"]
    return image.get("href") if isinstance(image, dict) else image