#!/usr/bin/python3
# pylint: disable=unused-import

"""Take one or more URLs as input, and report dead or malformed URLs referenced by them."""

import sys
import json
import typing
import urllib
import urllib.parse
import collections

import bs4
import requests


def make_used(variable: object) -> None:
    """Convince pyflakes that variable is used."""
    assert True or variable


make_used(typing)


def usage(retval: int) -> None:
    """Output a usage message."""
    if retval:
        write = sys.stderr.write
    else:
        write = sys.stdout.write
    string = 'Usage: {} --verbose --focus-prefix http://host/ --timeout seconds --as-json --urls url1 url2 ... urln\n'
    write(string.format(sys.argv[0]))
    write('\n')
    write('--focus-prefix may be specified more than once.\n')
    write('\n')
    write('Default output format is 3-column csv. The first column is the kind of problem detected, the second is the\n')
    write('link that had a problem, and the third column is the referrer link.\n')
    write('\n')
    write('Or you can specify --as-json, which is mostly self-explanatory.\n')
    sys.exit(retval)


def to_ascii(string: str) -> str:
    """Convert str to ASCII, replacing non-ASCII characters with XML character references."""
    with_xmlcharref = string.encode('ASCII', errors='xmlcharrefreplace')
    as_str = with_xmlcharref.decode()
    return as_str


def dead_link(*, candidate_url: str, referrer_url: str, malformed: bool) -> None:
    """Announce that candidate_url is a dead (or malformed) link."""
    ascii_candidate_url = to_ascii(candidate_url)
    ascii_referrer_url = to_ascii(referrer_url)
    if malformed:
        kind = 'malformed'
    else:
        kind = 'dead'
    if GLOBALS.as_json:
        GLOBALS.list_of_bad.append({'problem': kind, 'bad_link': ascii_candidate_url, 'referrer_link': ascii_referrer_url})
    else:
        print('%s|%s|%s' % (kind, ascii_candidate_url, ascii_referrer_url))


class Globals:  # pylint: disable=too-few-public-methods
    """Hold globals related to finding dead links - command line options in particular."""

    def __init__(self) -> None:
        """Initialize."""
        # This will hold a deque of tuples, where the 1st element of the tuple is the referrer URL, and the 2nd is
        # the URL to be checked.
        self.urls = None  # type: typing.Optional[typing.Deque[typing.Tuple[str, str]]]
        self.verbose = False
        self.timeout = 30.0
        self.focus_prefixes = []  # type: typing.List[str]
        self.list_of_bad = []  # type: typing.List[typing.Dict[str, str]]
        self.as_json = False

    def parse_options(self) -> None:
        """Parse command line options."""
        while sys.argv[1:]:
            if sys.argv[1] == '--urls':
                self.urls = collections.deque(('command line', url) for url in sys.argv[2:])
                break
            elif sys.argv[1] == '--as-json':
                self.as_json = True
            elif sys.argv[1] == '--timeout':
                self.timeout = float(sys.argv[2])
                del sys.argv[1]
            elif sys.argv[1] == '--focus-prefix':
                self.focus_prefixes.append(sys.argv[2])
                del sys.argv[1]
            elif sys.argv[1] in ('-v', '--verbose'):
                self.verbose = True
            elif sys.argv[1] in ('-h', '--help'):
                usage(0)
            else:
                sys.stderr.write('{}: Unrecognized option: {}\n'.format(sys.argv[0], sys.argv[1]))
                usage(1)
            del sys.argv[1]

        if self.urls is None:
            sys.stderr.write('{}: --urls is a required option\n'.format(sys.argv[0]))
            usage(1)

        if not self.focus_prefixes:
            sys.stderr.write('{}: --focus-prefix prefix is a required option\n'.format(sys.argv[0]))
            usage(1)


GLOBALS = Globals()


def tty_output(string: str) -> None:
    """Output string to stderr, if GLOBALS.verbose."""
    if GLOBALS.verbose:
        sys.stderr.write(string)
        sys.stderr.write('\n')


def is_in_focus_prefixes(url: str) -> bool:
    """Return True iff url has a prefix in focus_prefixes."""
    for focus_prefix in GLOBALS.focus_prefixes:
        if url.startswith(focus_prefix):
            return True
    return False


def main() -> None:  # pylint: disable=too-many-branches,too-many-statements
    """Parse command line options and check the requested URLs, reporting dead or malformed links."""
    GLOBALS.parse_options()

    already_seen = set()  # type: typing.Set[typing.Tuple[str, str]]

    while GLOBALS.urls:
        referrer_url, candidate_url = GLOBALS.urls.popleft()
        tty_output('checking %s' % to_ascii(candidate_url))

        if (referrer_url, candidate_url) in already_seen:
            # If we've seen this URL before, do not reprocess it.
            tty_output('already seen: referrer: %s, candidate: %s' % (to_ascii(referrer_url), to_ascii(candidate_url)))
            continue

        # We have not seen this URL before - make sure we don't reprocess it later.
        already_seen.add((referrer_url, candidate_url))

        try:
            page = requests.get(candidate_url, timeout=GLOBALS.timeout)
        except OSError:
            # Note that requests.exceptions.Timeout is a descendant of OSError.
            dead_link(referrer_url=referrer_url, candidate_url=candidate_url, malformed=False)
            continue
        except requests.packages.urllib3.exceptions.LocationParseError:
            # This is a weird URL - EG: portal.1stel.com:443~cbbrowne . Skip it.
            dead_link(referrer_url=referrer_url, candidate_url=candidate_url, malformed=True)
            continue

        if not page.ok:
            tty_output('not page.ok: page.status_code: %d' % page.status_code)
            dead_link(referrer_url=referrer_url, candidate_url=candidate_url, malformed=False)
            continue

        source_url = candidate_url
        del candidate_url

        if not is_in_focus_prefixes(source_url):
            # This is not on my website, it should be one hop away from my website - do not process it.
            tty_output('Skipping non-focus %s' % to_ascii(source_url))
            continue

        page_content = page.content
        soup = bs4.BeautifulSoup(page_content, 'lxml')
        a_tags = soup.find_all('a')
        for a_element in a_tags:
            href = a_element.get('href')
            if not href:
                # Skip <a> tags that have no href (EG: named anchors).
                continue
            abs_href = urllib.parse.urljoin(source_url, href)
            if is_in_focus_prefixes(abs_href):
                # This is on my website, so I always want to process it.
                tty_output('Adding 1 %s' % to_ascii(abs_href))
                GLOBALS.urls.append((source_url, abs_href))
                continue
            if is_in_focus_prefixes(source_url):
                # This is not on my website, but it is one hop away from my website - process it.
                tty_output('Adding 2 %s' % to_ascii(abs_href))
                GLOBALS.urls.append((source_url, abs_href))
                continue
            # If we get here, then the abs_href is on an outside website, and we were referred to it by
            # an outside website, so we do not want to check it further.
            tty_output('Ignoring %s going forward...' % to_ascii(abs_href))

    if GLOBALS.as_json:
        print(json.dumps(GLOBALS.list_of_bad, sort_keys=True, indent=4))


main()