#!/usr/bin/python3

# pylint: disable=superfluous-parens
# superfluous-parens: Parentheses are good for clarity and portability

"""
Check for changes in a series of URLs.  E-mail diffs as needed.

The URLs to check come from ~/.page-change/urls, one URL per line.  Anything from a '#' to the end of a line
is ignored, as are lines that are empty after that.
"""

import io
import os
import re
import sys
import difflib
import subprocess

# One or more consecutive spaces.  (A pattern of b' *' would also match the empty string, and re.sub would then
# insert a space between every pair of bytes instead of collapsing runs.)
_MULTIPLE_SPACES = re.compile(b' +')


def _normalize_line(line):
    """Strip trailing whitespace from a bytes line and collapse each run of spaces to a single space."""
    return _MULTIPLE_SPACES.sub(b' ', line.rstrip())


def find_links_binary():
    """Return True iff links binary is on our path."""
    # This didn't help with a highly dynamic page, but I've added it here as a comment in case I revisit that
    # someday.
    # 'elinks' '-no-numbering' '-dump' 'https://apps.fedoraproject.org/packages/bash'
    try:
        exit_code = subprocess.call(
            ['links', '--help'],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # The binary could not be executed at all (typically: not found on PATH).
        return False
    # Exit statuses 0 and 3 are both treated as "links ran".
    # NOTE(review): presumably some links builds exit 3 after printing help - confirm.
    return exit_code in (0, 3)


def get_prior_url_filename(directory, url):
    """
    Derive a filename from a url.

    Both arguments are bytes.  Slashes in the URL become dashes so that the whole URL is a single path
    component inside directory.
    """
    return os.path.join(directory, url.replace(b'/', b'-'))


def put_current_url_lines(directory, url, text):
    """Save the URL text (an iterable of bytes lines) to the file derived from directory and url."""
    prior_url_filename = get_prior_url_filename(directory, url)
    with open(prior_url_filename, 'wb') as file_:
        file_.writelines(b'%s\n' % line.rstrip() for line in text)


def get_prior_url_lines(directory, url):
    """
    Read the previous content of the URL from disk.

    Returns a list of bytes lines, normalized the same way as get_current_url_lines() output so that the two
    can be compared directly.  Returns an empty list if the file cannot be read (e.g. a URL seen for the
    first time).
    """
    prior_url_filename = get_prior_url_filename(directory, url)
    try:
        with open(prior_url_filename, 'rb') as file_:
            return [_normalize_line(line) for line in file_]
    except IOError:
        return []


def make_used(variable):
    """Persuade pyflakes that 'variable' is used."""
    assert True or variable


def get_current_url_lines(url):
    """
    Retrieve the current URL text using links, so that we get an ASCII dump of the page.

    This is a generator of normalized bytes lines; nothing runs until it is first iterated.
    Raises IOError if links exits with a nonzero status, and OSError if links cannot be started.
    """
    command = ['links', '-html-numbered-links', '0', '-dump', url]
    completed = subprocess.run(command, stdout=subprocess.PIPE, check=False)
    if completed.returncode != 0:
        raise IOError('links exited with status %d' % completed.returncode)
    # BytesIO iteration splits on b'\n' only, matching how the saved copy is read back from disk.
    for line in io.BytesIO(completed.stdout):
        yield _normalize_line(line)


def dump(text, filename):
    """
    Dump a list of bytes lines, or a single bytes object, to a filename for debugging.

    The first line written is b'list' or b'string', recording which form was given.
    """
    with open(filename, 'wb') as file_:
        if isinstance(text, list):
            file_.write(b'list\n')
            file_.writelines(b'%s\n' % line.rstrip() for line in text)
        else:
            file_.write(b'string\n')
            file_.write(text)


def _check_url(mailer, directory, url, verbose):
    """Fetch one URL; if its text differs from the saved copy, e-mail an HTML diff and save the new text."""
    # URLs are handled as bytes throughout; decode only for human-readable messages.
    url_text = url.decode('utf-8', 'replace')

    if verbose:
        sys.stderr.write('getting current for %s\n' % url_text)
    try:
        current_url_lines = list(get_current_url_lines(url))
    except OSError as exc:
        sys.stderr.write('Error getting url %s: %s\n' % (url_text, exc))
        return

    if verbose:
        sys.stderr.write('getting prior\n')
    prior_url_lines = get_prior_url_lines(directory, url)

    if prior_url_lines == current_url_lines:
        sys.stderr.write('no change\n')
        return

    if verbose:
        sys.stderr.write('diffing\n')
    # We're almost completely byte strings, except differ.make_table really wants strings :-S
    decoded_prior_url_lines = [line.decode('utf-8', 'replace') for line in prior_url_lines]
    decoded_current_url_lines = [line.decode('utf-8', 'replace') for line in current_url_lines]
    diffs_in_html = difflib.HtmlDiff().make_table(
        decoded_prior_url_lines,
        decoded_current_url_lines,
        context=True,
    )
    # HtmlDiff.make_table emits '&nbsp;' for every space; swap them for plain spaces
    # (presumably so that mail clients can wrap long lines).
    diffs_in_html = diffs_in_html.replace('&nbsp;', ' ')

    if verbose:
        sys.stderr.write('emailing\n')
    address = 'strombrg@gmail.com'
    subject = 'Changes in %s' % url_text
    mailer.send(address, address, subject, diffs_in_html, is_html=True)

    # Only remember the new text once the mail has gone out, so a failed send is retried on the next run.
    put_current_url_lines(directory, url, current_url_lines)


def main():
    """Check list of URLs for changes."""
    verbose = True

    if not find_links_binary():
        # NOTE(review): this only warns; every fetch below will then fail and be reported individually.
        # Consider exiting here instead.
        sys.stderr.write('%s: Could not find links\n' % sys.argv[0])

    # The mailer module lives in the user's private ~/lib.  It is imported here, not at module level, so that
    # the helper functions in this file can be imported without it.
    sys.path.insert(0, os.path.expanduser('~/lib'))
    import mailer as mailer_mod  # pylint: disable=import-outside-toplevel,import-error

    mailer = mailer_mod.Mailer('drs4auto@gmail.com')

    # Paths are kept as bytes because the URLs (read in binary mode) become part of the filenames.
    directory = os.path.expanduser(b'~/.page-change')
    urls_path = os.path.join(directory, b'urls')

    try:
        urls_file = open(urls_path, 'rb')
    except IOError:
        sys.stderr.write('%s: Error opening %s\n' % (sys.argv[0], os.fsdecode(urls_path)))
        sys.exit(1)

    with urls_file:
        for raw_url in urls_file:
            # Drop comments and surrounding whitespace; skip lines with nothing left.
            url = re.sub(b'#.*$', b'', raw_url).strip()
            if not url:
                continue
            _check_url(mailer, directory, url, verbose)


if __name__ == '__main__':
    main()