#!/usr/local/cpython-3.5/bin/python3

"""Extract a CSV file from an HTML table.

The HTML is read from a URL (--url) or a local file (--file).  Every table
found is written as '|'-delimited CSV, either to standard output or, with
--numbered-outputs, to table-1.csv, table-2.csv, and so on.
"""

from __future__ import print_function

import io
import re
import csv
import sys
import urllib.request  # pylint: disable=no-name-in-module,import-error
from html.parser import HTMLParser  # pylint: disable=import-error

# When true, HTML comments met while parsing are not reported on stderr.
# main() sets this from the --quiet option.
quiet = False

# Tags whose text content becomes one output cell.  Header cells (th) are
# cells of a row, not rows themselves.
_CELL_TAGS = ('td', 'th')

# Used by --strip-data; \s already covers carriage returns.
_WHITESPACE_RUN = re.compile(r'\s+')


def make_used(variable):
    """Persuade linters that variable is used."""
    del variable


def usage(retval):
    """
    Output a usage message and exit with status retval.

    The message goes to stdout when retval is 0 and to stderr otherwise.
    """
    write = sys.stdout.write if retval == 0 else sys.stderr.write

    write('Usage: {} --url http://www.example.com/ --file /path/to-file.html --strip-data --quiet\n'.format(sys.argv[0]))
    write('\n')
    write('--strip-data condenses runs of whitespace down to a single space,\n')
    write(' and removes leading and trailing whitespace\n')
    write('--numbered-outputs writes table-1.csv, table-2.csv, etc. Nice for inputs with more than one table\n')
    write('--quiet silences output related to comments.\n')

    sys.exit(retval)


class TableExtractor(HTMLParser):  # pylint: disable=too-many-instance-attributes
    """
    Extract tables from HTML and write each one out as '|'-delimited CSV.

    Rows and cells are only closed by explicit end tags (</tr>, </td>, </th>);
    a table is written when its </table> tag is seen.
    """

    def __init__(self, strip_data, numbered_outputs):
        """
        Initialize.

        strip_data: when true, collapse whitespace runs in cell text to one
            space and trim both ends.
        numbered_outputs: when true, write each table to table-<n>.csv in the
            current directory instead of standard output.
        """
        super().__init__()
        self.strip_data = strip_data
        self.numbered_outputs = numbered_outputs
        self.table_level = 0
        # Number of <table> start tags seen so far; also names numbered outputs.
        self.table_count = 0
        self.td_level = 0
        self.tr_level = 0
        # Rows of the table currently being collected (reset on each <table>).
        self.csv_data = []
        # Cells of the row currently being collected.
        self.row = []
        # Text fragments of the cell currently being collected.
        self.cell = []

    def handle_starttag(self, tag, attrs):
        """Handle a start tag: table, tr, td, th."""
        make_used(attrs)
        if tag == 'table':
            self.table_level += 1
            self.table_count += 1
            self.csv_data = []
        elif tag == 'tr':
            self.tr_level += 1
        elif tag in _CELL_TAGS:
            self.td_level += 1
        # We intentionally don't stress about other tags

    def handle_endtag(self, tag):
        """Handle an end tag: table, tr, td, th."""
        if tag == 'table':
            self._write_table()
            self.table_level -= 1
        elif tag == 'tr':
            # The guard keeps a stray </tr> from driving the level negative,
            # which would make the next real row be dropped.
            if self.tr_level > 0:
                self.csv_data.append(self.row)
                self.row = []
                self.tr_level -= 1
        elif tag in _CELL_TAGS:
            if self.td_level > 0:
                self.row.append(' '.join(self.cell))
                self.cell = []
                self.td_level -= 1
        # We intentionally don't stress about other tags

    def handle_data(self, data):
        """Handle data: collect text that appears inside a cell."""
        if self.td_level <= 0:
            return
        if self.strip_data:
            data = _WHITESPACE_RUN.sub(' ', data).strip()
        if data:
            self.cell.append(data)

    @staticmethod
    def handle_comment(data):
        """Handle comments: report them on stderr unless quiet is set."""
        if not quiet:
            sys.stderr.write("Found comment: {}\n".format(data))

    def _write_rows(self, file_):
        """Write the rows of the current table to the text stream file_."""
        csvwriter = csv.writer(file_, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerows(self.csv_data)

    def _write_table(self):
        """Write the current table to its numbered file or to standard output."""
        if self.numbered_outputs:
            # newline='' lets the csv module control line endings itself.
            with open('table-{}.csv'.format(self.table_count), 'w', encoding='utf-8', newline='') as file_:
                self._write_rows(file_)
            return

        binary_stdout = getattr(sys.stdout, 'buffer', None)
        if binary_stdout is None:
            # sys.stdout has no binary layer to wrap; write to it directly.
            self._write_rows(sys.stdout)
            sys.stdout.flush()
            return

        # Force UTF-8 regardless of the locale.  The wrapper is detached when
        # done so that discarding it does not close the process's stdout.
        wrapper = io.TextIOWrapper(binary_stdout, encoding='utf-8', newline='')
        try:
            self._write_rows(wrapper)
            wrapper.flush()
        finally:
            wrapper.detach()


def process(html_string, strip_data, numbered_outputs):
    """
    Extract one or more tables from html_string.

    When writing to standard output, warn on stderr unless exactly one table
    was found.
    """
    table_extractor = TableExtractor(strip_data, numbered_outputs)
    table_extractor.feed(html_string)
    if not numbered_outputs and table_extractor.table_count != 1:
        sys.stderr.write('{}: Warning: {} tables found\n'.format(sys.argv[0], table_extractor.table_count))


def main():
    """Extract HTML tables to CSV."""
    global quiet

    url = None
    filename = None
    strip_data = False
    numbered_outputs = False

    # Parse a copy of the arguments; sys.argv itself is left untouched.
    args = iter(sys.argv[1:])
    for option in args:
        if option in ('--url', '--file'):
            value = next(args, None)
            if value is None:
                sys.stderr.write('{}: {} requires an argument\n'.format(sys.argv[0], option))
                usage(1)
            if option == '--url':
                url = value
            else:
                filename = value
        elif option == '--quiet':
            quiet = True
        elif option == '--strip-data':
            strip_data = True
        elif option == '--numbered-outputs':
            numbered_outputs = True
        elif option in ('-h', '--help'):
            usage(0)
        else:
            sys.stderr.write('{}: Unrecognized option: {}\n'.format(sys.argv[0], option))
            usage(1)

    if url is not None and filename is not None:
        sys.stderr.write('{}: --url and --file are mutually exclusive\n'.format(sys.argv[0]))
        usage(1)
    if url is None and filename is None:
        sys.stderr.write('{}: You must specify one of --url or --file\n'.format(sys.argv[0]))
        usage(1)

    if url is not None:
        with urllib.request.urlopen(url) as response:  # pylint: disable=no-member
            html = response.read().decode('utf-8')
    else:
        with open(filename, 'r', encoding="utf-8") as file_:
            html = file_.read()

    process(html, strip_data, numbered_outputs)


# Only run as a script, so that importing this module has no side effects.
if __name__ == '__main__':
    main()