#!/usr/bin/python3

"""Index a collection of filenames retrieved from stdin"""

import re
import sys
import time

sys.path.append("/usr/local/lib")

import base255
import cachedb
import readline0

# Which on-disk database implementation to use.  Only 1 (dbm.gnu) still works
# on Python 3; the other historical choices are kept below for reference.
DBNUMBER = 1

# Wall-clock start time of the indexing pass; assigned for real in do_index()
# and read by the progress-reporting functions.
TIME0 = -1

if DBNUMBER == 1:
    import dbm.gnu

    # DBTYPE is the module handed to cachedb.Database_cache; it must provide
    # a dbm-style open().
    DBTYPE = dbm.gnu
elif DBNUMBER == 2:
    # this one appears to be the fastest by far, at least at around 4000 messages.
    raise ValueError("Python 3.x no longer builds dbm.bsd out of the box")
elif DBNUMBER == 3:
    raise ValueError("DBNUMBER 3 no longer supported")
    # import dbm.bsd
    # DBTYPE = dbm.bsd
elif DBNUMBER == 4:
    raise ValueError("DBNUMBER 4 no longer supported")
    # import dbm
    # DBTYPE = dbm
elif DBNUMBER == 5:
    raise ValueError("DBNUMBER 5 no longer supported")
    # import anydbm
    # DBTYPE = anydbm
elif DBNUMBER == 6:
    raise ValueError("DBNUMBER 6 no longer supported")
    # import dumbdbm
    # DBTYPE = dumbdbm
else:
    sys.stderr.write("Sorry, illegal database number selected\n")
    sys.exit(1)


def make_used(*variables):
    """Convince linter that 'variables' are used."""
    # Unbinding the tuple is enough to count as a "use"; no other effect.
    del variables


def usage(retval):
    """Write a usage message to stderr and exit with status retval."""
    message_lines = [
        "Usage: %s -d databasename [-u] [-c] [-s keyword] [-D] [-v]" % sys.argv[0],
        "-u says to update the database with all files on stdin",
        "-c says to start a new database with all files on stdin",
        "-s keyword says to list all files containing keyword",
        "-D says to dump the database contents to stdout (mostly useful for debugging)",
        "-v says to operate verbosely (can be repeated)",
        "-A says to use a heuristic to skip lines that look like base64/uuencode/binhex or similar",
        "-p says to skip indexing pronouns",
        "-n says to skip indexing numbers (all decimal digits)",
        "-N says to skip indexing words containing any digits",
        "-0 says filename list on stdin will be null terminated, not newline terminated",
        "",
        "In any case -d is required",
        "You must specify exactly one of -c, -u, -s or -D",
    ]
    # One write, identical bytes to writing each line individually.
    sys.stderr.write("\n".join(message_lines) + "\n")
    sys.exit(retval)


class pyindex_class:
    """Index a collection of files.

    Five databases (each a cachedb.Database_cache over DBTYPE) back the index:
    word <-> base255 abbreviation, filename <-> base255 abbreviation, and
    abbreviated word -> collection of abbreviated filenames.
    """

    def __init__(self, dbname, local_filename_collection_class, verbose):
        """Record configuration; the databases themselves are opened in open()."""
        self.dbname = dbname
        self.local_filename_collection_class = local_filename_collection_class
        # The databases and counters are created in open(); predefine the
        # attributes (as None) so an unopened instance fails clearly rather
        # than with AttributeError.
        self.db_abbrv_word_to_abbrv_fns = None
        self.db_abbreviation_to_filename = None
        self.db_filename_to_abbreviation = None
        self.db_abbreviation_to_word = None
        self.db_word_to_abbreviation = None
        self.highest_word_abbreviation = None
        self.highest_filename_abbreviation = None
        self.mode = None
        self.verbose = verbose

    def creating_from_scratch(self):
        """Return True iff we're creating a database from scratch"""
        return self.mode == "n"

    def open(self, mode):
        """Open the databases.

        mode should generally be one of:
        1) r - read only
        2) w - read/write, and don't create if nonexistent
        3) c - read/write, and do create if nonexistent
        4) n - read/write, and always make an empty database, even if one already existed
        """
        # 0 is good for normal use
        # 2 is good for testing cache behaviors, because it won't take as long to exercise cache features
        cache_size_set = 0
        if cache_size_set == 0:
            expensive_item_cache_sizes = 5000000
            inexpensive_item_cache_sizes = 800000
        elif cache_size_set == 1:
            expensive_item_cache_sizes = 500000
            inexpensive_item_cache_sizes = 80000
        elif cache_size_set == 2:
            expensive_item_cache_sizes = 5000
            inexpensive_item_cache_sizes = 800
        else:
            raise SystemExit("{}: internal error: cache_size_set has an invalid value\n".format(sys.argv[0]))

        # we key off of this value to see if we're recreating from scratch (see creating_from_scratch()).
        self.mode = mode

        # key: base255 abbrv_word -> value: list of base255 abbrv filenames -> on disk: a single string that comprises a
        # list of \0 separated base255 abbrv filenames.  In this we're much more than just I/O caching.  We're also
        # keeping, in the cache, a python list (or set) of the base255 numbers so we don't have to constantly convert
        # the string to a list and back, which was really killing the running time of the program.  Then if we need an
        # entry back in the cache from disk, it is automatically converted from the single-string representation back to
        # the python list.
        self.db_abbrv_word_to_abbrv_fns = cachedb.Database_cache(
            self.dbname + ".abbrv-word-to-abbrv-filelist",
            DBTYPE,
            mode,
            to_string=lambda x: x.to_binary(),
            from_string=self.local_filename_collection_class,
            max_elements_in_memory=expensive_item_cache_sizes,
            write_through=False,
            too_many_percent=88.0,
            verbose=True,
        )

        # key: base255 abbrv filename -> value: filename -> on disk: filename
        # In this we're only doing I/O caching
        self.db_abbreviation_to_filename = cachedb.Database_cache(
            self.dbname + ".abbreviation-to-filename",
            DBTYPE,
            mode,
            max_elements_in_memory=inexpensive_item_cache_sizes,
            write_through=False,
            verbose=True,
        )

        # key: filename -> value: base255 filename abbreviation -> on disk: base255 filename abbreviation
        # In this we're only doing I/O caching
        self.db_filename_to_abbreviation = cachedb.Database_cache(
            self.dbname + ".filename-to-abbreviation",
            DBTYPE,
            mode,
            max_elements_in_memory=inexpensive_item_cache_sizes,
            write_through=False,
            verbose=True,
        )

        # key: base255 abbrv word -> value: word -> on disk: word
        # In this we're only doing I/O caching
        self.db_abbreviation_to_word = cachedb.Database_cache(
            self.dbname + ".abbreviation-to-word",
            DBTYPE,
            mode,
            max_elements_in_memory=inexpensive_item_cache_sizes,
            write_through=False,
            verbose=True,
        )

        # key: word -> value: word abbreviation as base255 -> on disk: word abbreviation as base255
        # In this we're only doing I/O caching
        self.db_word_to_abbreviation = cachedb.Database_cache(
            self.dbname + ".word-to-abbreviation",
            DBTYPE,
            mode,
            max_elements_in_memory=inexpensive_item_cache_sizes,
            write_through=False,
            verbose=True,
        )

        # Find the highest word and filename abbreviations.  Assume that's a good place to start counting from, rather
        # than trying to "fill in holes" :).  This means faster performance for databases that are removed and recreated
        # from time to time, but slower performance for databases that are created and modified and modified and
        # modified...
        self.highest_word_abbreviation = 0
        self.highest_filename_abbreviation = 0
        for abbrv_word in self.db_abbreviation_to_word.keys():
            number = base255.base255_to_number(abbrv_word)
            if number > self.highest_word_abbreviation:
                self.highest_word_abbreviation = number
        for abbrv_filename in self.db_abbreviation_to_filename.keys():
            number = base255.base255_to_number(abbrv_filename)
            if number > self.highest_filename_abbreviation:
                self.highest_filename_abbreviation = number

    def abbrv_fn_to_fn(self, abbrv_filename):
        """Translate an abbreviated filename to a filename (b"" if unknown)."""
        return self.db_abbreviation_to_filename.get(abbrv_filename, b"")

    def fn_to_abbrv_fn(self, filename):
        """Translate a filename to an abbrv filename (b"" if unknown)."""
        return self.db_filename_to_abbreviation.get(filename, b"")

    def abbrv_word_to_word(self, abbrv_word):
        """Translate an abbreviated word to a word (b"" if unknown)."""
        return self.db_abbreviation_to_word.get(abbrv_word, b"")

    def word_to_abbrv_word(self, word):
        """Translate a word to an abbrv word (b"" if unknown)."""
        return self.db_word_to_abbreviation.get(word, b"")

    def __contains__(self, word):
        """Return True iff word has been indexed."""
        return word in self.db_word_to_abbreviation

    def keys(self):
        """Generate the (unabbreviated) words present in the index."""
        abbrv_words = self.db_abbrv_word_to_abbrv_fns.keys()
        for abbrv_word in abbrv_words:
            # Defensive check: an empty abbreviation should never occur.
            if abbrv_word == b"":
                sys.stderr.write(
                    '%s: in keys(), internal error: could not map abbreviated word "%s" to a word\n'
                    % (
                        sys.argv[0],
                        abbrv_word,
                    )
                )
                sys.exit(1)
            yield self.db_abbreviation_to_word[abbrv_word]

    def __iter__(self):
        # Without an explicit __iter__, "for word in database" would fall back
        # to the legacy __getitem__(0), __getitem__(1), ... protocol; our
        # __getitem__ never raises IndexError, so that loop would never end.
        return self.keys()

    def __getitem__(self, word):
        """Return the list of filenames (as str) that contain word."""
        abbreviated_word = self.word_to_abbrv_word(word)
        try:
            abbreviated_filenames = self.db_abbrv_word_to_abbrv_fns[abbreviated_word].files()
        except KeyError:
            abbreviated_filenames = []
        unabbreviated_filenames = [self.abbrv_fn_to_fn(abbreviated_filename) for abbreviated_filename in abbreviated_filenames]
        result = [string.decode("ISO-8859-1") for string in unabbreviated_filenames]
        return result

    def add_filename_to_words(self, filename, words):
        """Record that filename contains each of words (an iterable of bytes)."""
        if self.verbose >= 3:
            # words are bytes, so join as bytes and decode for the message;
            # a str " ".join over bytes items would raise TypeError.
            sys.stderr.write("Adding words %s\n" % b" ".join(words).decode("ISO-8859-1"))
        abbrv_filename = self.fn_to_abbrv_fn(filename)
        if abbrv_filename == b"":
            # filename abbreviation does not yet exist, set up the filename abbreviation
            self.highest_filename_abbreviation += 1
            abbrv_filename = base255.number_to_base255(self.highest_filename_abbreviation)
            self.db_abbreviation_to_filename[abbrv_filename] = filename
            self.db_filename_to_abbreviation[filename] = abbrv_filename
        for word in words:
            abbrv_word = self.word_to_abbrv_word(word)
            if abbrv_word == b"":
                # word abbreviation does not yet exist, set up the word abbreviation
                self.highest_word_abbreviation += 1
                abbrv_word = base255.number_to_base255(self.highest_word_abbreviation)
                self.db_abbreviation_to_word[abbrv_word] = word
                self.db_word_to_abbreviation[word] = abbrv_word
            if abbrv_word in self.db_abbrv_word_to_abbrv_fns:
                if self.verbose >= 4:
                    sys.stderr.write("Adding filename %s to word %s\n" % (filename, word))
                self.db_abbrv_word_to_abbrv_fns[abbrv_word].add(abbrv_filename)
            else:
                # An abbreviated filename list for this abbreviated word does not yet exist - create one
                if self.verbose >= 4:
                    sys.stderr.write("Adding filename %s to word %s by starting a new word\n" % (filename, word))
                # abbrv_filename already holds db_filename_to_abbreviation[filename]
                self.db_abbrv_word_to_abbrv_fns[abbrv_word] = self.local_filename_collection_class(
                    abbrv_filename,
                )

    def close(self):
        """Close our databases"""
        self.db_abbrv_word_to_abbrv_fns.close()
        self.db_abbreviation_to_filename.close()
        self.db_filename_to_abbreviation.close()
        self.db_abbreviation_to_word.close()
        self.db_word_to_abbreviation.close()


def good_length(word):
    """Return True iff word is "of a good length": 3..20 characters inclusive.

    Works on both str and bytes.  1-2 character words are too common to index
    usefully; words longer than 20 characters are almost certainly not
    natural-language words.
    """
    # The original "word[2:] and not word[20:]" returned b''/'' (not False)
    # for short words; return a real bool as the docstring promises.
    return 2 < len(word) <= 20


WORD_REGEX = re.compile(rb"\w+")
WHITESPACE_REGEX = re.compile(rb"\s")
NUMBER_REGEX = re.compile(rb"\d")
# English pronouns, skipped when -p is given.  ("everybody" was previously
# misspelled "everbody" and so was never skipped; duplicate entries removed.)
PRONOUN_LIST = [
    "i",
    "all",
    "another",
    "any",
    "anybody",
    "anyone",
    "anything",
    "both",
    "each",
    "either",
    "everybody",
    "everyone",
    "everything",
    "few",
    "he",
    "her",
    "hers",
    "herself",
    "him",
    "himself",
    "his",
    "it",
    "its",
    "itself",
    "many",
    "me",
    "mine",
    "my",
    "myself",
    "neither",
    "no",
    "nobody",
    "none",
    "nothing",
    "one",
    "others",
    "our",
    "ours",
    "ourselves",
    "several",
    "she",
    "some",
    "somebody",
    "someone",
    "something",
    "that",
    "their",
    "theirs",
    "them",
    "themselves",
    "these",
    "they",
    "this",
    "us",
    "we",
    "what",
    "which",
    "who",
    "whom",
    "whose",
    "you",
    "your",
    "yours",
    "yourself",
    "yourselves",
]
# if a pronoun doesn't have a "good length", it'll never be a match anyway, so eliminate the bad length pronouns from
# the list to search
PRONOUNS = set(pronoun.encode("ISO-8859-1") for pronoun in PRONOUN_LIST if good_length(pronoun))

# this version uses a hash table (python "dictionary") to speed up determining if a word is a pronoun.  It's probably
# faster than using a binary search on a sorted array (python "list"), but I believe it's worthwhile to keep the
# possibility of a binary search in reserve if the dict proves too time consuming


def is_not_a_pronoun(word):
    """Return True iff word is not a pronoun"""
    # PRONOUNS is a set, so this membership test is O(1).
    return word not in PRONOUNS


def does_not_contain_a_digit(word):
    """Return True iff word does not contain a digit"""
    # search() scans the whole word; None means no digit anywhere.
    return NUMBER_REGEX.search(word) is None


def is_not_a_number(word):
    """Return True iff word is not composed entirely of decimal digits.

    The -n option is documented as skipping "numbers (all decimal digits)";
    the previous NUMBER_REGEX.match() test only looked at the first character,
    so any word merely *starting* with a digit was treated as a number.
    """
    return not word.isdigit()


def split_file_into_words(file, avoid_ascii_coded_binary, skip_pronouns, skip_numbers, skip_words_with_digits):
    """Return the set of unique, indexable words (bytes) found in file.

    file: an iterable of bytes lines (e.g. a file opened "rb").
    avoid_ascii_coded_binary: skip lines that look like base64/uuencode/binhex.
    skip_pronouns: drop English pronouns.
    skip_numbers: drop words that are entirely decimal digits.
    skip_words_with_digits: drop words containing any digit (supersedes
        skip_numbers: a word that is all digits also contains a digit).
    """
    # \d When the UNICODE flag is not specified, matches any decimal digit; this is equivalent to the set [0-9].
    # With UNICODE, it will match whatever is classified as a digit in the Unicode character properties database.

    # \w When the LOCALE and UNICODE flags are not specified, matches any alphanumeric character and the underscore;
    # this is equivalent to the set [a-zA-Z0-9_].
    unique_set = set()
    for line in file:
        line = line.strip()
        if avoid_ascii_coded_binary and line[20:] and WHITESPACE_REGEX.search(line) is None:
            # Skip this line - it's over 20 characters long, and it contains no whitespace (aside from at its beginning
            # and/or end), so it's very, very likely to be just a bunch of base64 (or other binary-to-text mapping like
            # binhex or uuencode) gibberish that we don't need (indeed, that we don't -want-) to index.  If we were to
            # convert the gibberish to some sort of meaningful representation - for example, by converting an
            # openoffice document to text - -then- it would be reasonable to search through -that-.  But we aren't.
            #
            # someday, if this heuristic proves too hefty, then we might switch to slurping in a dictionary of
            # acceptable words from /usr/dict/words or whatever, and only store words that appear in that dictionary
            continue
        words = [word.lower() for word in WORD_REGEX.findall(line) if good_length(word)]
        if skip_pronouns:
            words = [word for word in words if is_not_a_pronoun(word)]
        # note that if a word has a digit, and is skipped as a result, there's no sense in checking if it's
        # composed entirely of digits to skip that too
        if skip_words_with_digits:
            # The predicate must be *called*; the previous code tested the
            # bare function object, which is always truthy, so -N did nothing.
            words = [word for word in words if does_not_contain_a_digit(word)]
        elif skip_numbers:
            words = [word for word in words if is_not_a_number(word)]
        unique_set.update(words)
    return unique_set


def human_readable_seconds(seconds):
    """Convert a duration in seconds to a human-readable string.

    Units are seconds, minutes, hours, 30-day "days" and a final unbounded
    "y" unit (one "y" == 30 days, matching the original unit table).
    Returns " 0s" for zero or negative durations.
    """
    units = [(60, "s"), (60, "m"), (24, "h"), (30, "d")]
    remaining = int(seconds)
    result = []
    for divisor, suffix in units:
        if remaining <= 0:
            break
        # divmod avoids the float-division precision loss of int(temp / k)
        # for very large durations.
        remaining, amount = divmod(remaining, divisor)
        result.append(str(amount) + suffix)
    # Report whatever is left in the final unit instead of silently dropping
    # it (the original divided by 12 and discarded the quotient, so whole
    # multiples of 12 "y" units vanished from the output).
    if remaining > 0:
        result.append(str(remaining) + "y")
    result.reverse()
    if not result:
        return " 0s"
    return " " + " ".join(result)


def null_progress(fileno, numfiles):
    """A function for displaying progress data with verbose >= 0 - IOW, don't output anything"""
    # Deliberately a no-op; the arguments exist only to match the other
    # progress functions' signatures.
    make_used(fileno, numfiles)


def verbose_progress(fileno, numfiles):
    """A function for displaying progress data with verbose >= 1"""
    # Only report once every 1000 files, and never on file 0 (rate would
    # divide by zero anyway).
    if fileno == 0 or fileno % 1000 != 0:
        return
    elapsed = time.time() - TIME0
    rate = fileno / elapsed
    if numfiles == 0:
        sys.stderr.write(
            "file %d, %f files/second,%s elapsed\n"
            % (
                fileno,
                rate,
                human_readable_seconds(elapsed),
            )
        )
    else:
        remaining = (numfiles - fileno) / rate
        sys.stderr.write(
            "File %d of %d, %f%% done, %f files/second,%s elapsed,%s estimated remaining \n"
            % (
                fileno,
                numfiles,
                (fileno * 1000.0 / numfiles) / 10.0,
                rate,
                human_readable_seconds(elapsed),
                human_readable_seconds(remaining),
            )
        )


def very_verbose_progress(fileno, numfiles):
    """A function for displaying progress data with verbose >= 2"""
    # Report on every file except file 0 (rate would divide by zero).
    if fileno == 0:
        return
    elapsed = time.time() - TIME0
    rate = fileno / elapsed
    if numfiles == 0:
        sys.stderr.write(
            "file %d, %f files/second,%s elapsed\n"
            % (
                fileno,
                rate,
                human_readable_seconds(elapsed),
            )
        )
    else:
        remaining = (numfiles - fileno) / rate
        sys.stderr.write(
            "File %d of %d, %f%% done, %f files/second,%s elapsed,%s estimated remaining \n"
            % (
                fileno,
                numfiles,
                (fileno * 1000.0 / numfiles) / 10.0,
                rate,
                human_readable_seconds(elapsed),
                human_readable_seconds(remaining),
            )
        )


def get_filenames(verbose, start_from_scratch, delimiter):
    """Read the list of filenames to index from stdin and return them as a list.

    verbose: if truthy, report progress to stderr every 10000 lines.
    start_from_scratch: if True, de-duplicate the filenames (order is then
        unspecified); otherwise preserve order and duplicates.
    delimiter: the byte separating filenames on stdin (b"\\n" or b"\\0").
    """
    # Collect into a set (dedup) or a list (preserve order/dups); the loop
    # body is otherwise identical, so share it.
    if start_from_scratch:
        collection = set()
        insert = collection.add
    else:
        collection = []
        insert = collection.append
    filename_no = 0
    for filename in readline0.readline0(sys.stdin.fileno(), delimiter):
        insert(filename)
        filename_no += 1
        if verbose and filename_no % 10000 == 0:
            sys.stderr.write("\rRead %d lines\n" % filename_no)
    return list(collection)


def do_index(
    database,
    avoid_ascii_coded_binary,
    skip_pronouns,
    skip_numbers,
    skip_words_with_digits,
    start_from_scratch,
    delimiter,
    verbose,
):
    """Index the collection of files whose names arrive on stdin.

    database: an (unopened) pyindex_class instance.
    avoid_ascii_coded_binary, skip_pronouns, skip_numbers,
    skip_words_with_digits: word-filter flags, passed to split_file_into_words.
    start_from_scratch: True to erase and recreate the database.
    delimiter: filename separator on stdin (b"\\n" or b"\\0").
    verbose: verbosity level; selects the progress reporter.
    """
    global TIME0
    # Pick the progress reporter matching the verbosity level.
    if verbose == 0:
        progress = null_progress
    elif verbose == 1:
        progress = verbose_progress
    elif verbose >= 2:
        progress = very_verbose_progress
    else:
        raise Exception("No progress function for verbose {}".format(verbose))
    if start_from_scratch:
        # erase what's there, and open for read/write
        database.open("n")
    else:
        # open what's there read/write, creating if necessary
        database.open("c")

    if verbose:
        sys.stderr.write("Inhaling filenames\n")
    filenames = get_filenames(verbose, start_from_scratch, delimiter)
    num_filenames = len(filenames)
    if verbose:
        sys.stderr.write("Done inhaling %d filenames\n" % num_filenames)
        if start_from_scratch:
            sys.stderr.write("Starting to create an all-new index\n")
        else:
            sys.stderr.write("Starting to update the potentially-preexisting index\n")
    TIME0 = time.time()
    unopenable = 0
    for filenameno, filename in enumerate(filenames):
        progress(filenameno, num_filenames)
        if verbose >= 3:
            sys.stderr.write("Indexing file %s\n" % filename)
        try:
            file_ = open(filename, "rb")
        except IOError:
            # best-effort: report the unreadable file and keep going
            sys.stderr.write("Could not open %s\n" % filename)
            unopenable += 1
            continue
        # "with" guarantees the file is closed even if indexing raises.
        # split_file_into_words de-duplicates words within a single file, but
        # repeats across files still exercise the caches.
        with file_:
            database.add_filename_to_words(
                filename,
                split_file_into_words(file_, avoid_ascii_coded_binary, skip_pronouns, skip_numbers, skip_words_with_digits),
            )
    database.close()
    if verbose:
        sys.stderr.write("%d of %d files were unopenable\n" % (unopenable, num_filenames))


def do_search(database, dbname, keyword):
    """Search a database for a keyword"""
    database.open("r")
    # Guard clause: unknown keyword means nothing to print.
    if keyword not in database:
        sys.stderr.write("Sorry, %s is not in the database\n" % (keyword))
        sys.exit(1)
    filenames = database[keyword]
    if not filenames:
        sys.stderr.write("Sorry, %s is not in database %s\n" % (keyword, dbname))
        sys.exit(1)
    for filename in filenames:
        print(filename)


def do_dump(database):
    """Dump a database to stdout: one line per word, "keyno word nfiles files...".

    Iterates via database.keys() explicitly: "for key in database" would fall
    back to the legacy __getitem__(0), __getitem__(1), ... iteration protocol
    when the database class defines no __iter__, and pyindex_class.__getitem__
    returns a list (never raising IndexError), so that loop would never
    terminate.
    """
    database.open("r")
    for keyno, key in enumerate(database.keys()):
        sys.stdout.write("%d %s" % (keyno, key))
        values = database[key]
        sys.stdout.write(" %d" % len(values))
        for value in values:
            sys.stdout.write(" %s" % value)
        sys.stdout.write("\n")

class filename_set_collection_class:
    """
    A filename collection class based on sets.

    Handles the full "read string from disk, convert string to set, make
    changes, convert set back to string, write string to disk" round trip
    (the caching itself lives in cachedb).
    """

    def __init__(self, base255_strings):
        # Splitting on NUL recovers the individual base255 abbreviations.
        self.set_ = set(base255_strings.split(b"\0"))

    def files(self):
        """Return the files"""
        return list(self.set_)

    def to_binary(self):
        """Convert to a bytes representation"""
        return b"\0".join(self.set_)

    def add(self, abbrv_filename):
        """Add an abbreviated filename"""
        self.set_.add(abbrv_filename)


class filename_list_collection_class:
    """A filename collection class based on lists (not sets, despite the old docstring).

    add() appends unconditionally with no de-duplication, so this class is
    ONLY safe when building a brand-new index from scratch (each file is
    visited exactly once there).
    """

    def __init__(self, base255_strings):
        # Splitting on NUL recovers the individual base255 abbreviations.
        self.list_ = base255_strings.split(b"\0")

    def files(self):
        """Return the list of files"""
        return self.list_

    def to_binary(self):
        """Convert to a bytes representation"""
        return b"\0".join(self.list_)

    def add(self, abbrv_filename):
        """Add an abbreviated filename (no duplicate check)"""
        self.list_.append(abbrv_filename)


class Options:
    """Parse and hold the command-line options (consumes sys.argv in place)."""

    def __init__(self):
        """Parse sys.argv, recording the selected action and flags."""
        self.index = False
        self.search = False
        self.dump = False
        self.dbname = ""
        # Defaults for attributes that were previously only set conditionally,
        # so unrelated code paths can't hit AttributeError.
        self.keyword = ""
        self.start_from_scratch = False
        self.verbose = 0
        self.avoid_ascii_coded_binary = False
        self.skip_pronouns = False
        self.skip_numbers = False
        self.skip_words_with_digits = False
        self.delimiter = b"\n"

        # Each iteration handles sys.argv[1]; options that take an argument
        # delete the argument here, and the shared "del" at the bottom of the
        # loop removes the option itself.
        while sys.argv[1:]:
            if sys.argv[1] == "-d" and sys.argv[2:]:
                self.dbname = sys.argv[2]
                del sys.argv[1]
            elif sys.argv[1] == "-s" and sys.argv[2:]:
                self.search = True
                self.keyword = sys.argv[2]
                del sys.argv[1]
            elif sys.argv[1] == "-u":
                self.index = True
                self.start_from_scratch = False
            elif sys.argv[1] == "-c":
                self.index = True
                self.start_from_scratch = True
            elif sys.argv[1] == "-D":
                self.dump = True
            elif sys.argv[1] == "-v":
                # -v may be repeated; each occurrence bumps the level
                self.verbose += 1
            elif sys.argv[1] == "-A":
                self.avoid_ascii_coded_binary = True
            elif sys.argv[1] == "-p":
                self.skip_pronouns = True
            elif sys.argv[1] == "-n":
                self.skip_numbers = True
            elif sys.argv[1] == "-N":
                self.skip_words_with_digits = True
            elif sys.argv[1] == "-0":
                self.delimiter = b"\0"
            elif sys.argv[1] == "-h":
                usage(0)
            else:
                usage(1)
            del sys.argv[1]

    def check_options(self):
        """Exit via usage() unless the parsed options are consistent."""
        if self.dbname == "":
            sys.stderr.write("You must specify a database name with -d\n")
            usage(1)

        # Exactly one action may be requested; bools sum as 0/1 ints.
        if self.index + self.search + self.dump != 1:
            sys.stderr.write(
                "You must give exactly one of -s, -c, -u or -D (internal use: %d)\n" % (self.index + self.search + self.dump)
            )
            usage(1)


def main():
    """
    Main function.

    Parse options, validate option consistency, and invoke top-level behaviors.
    """
    options = Options()
    options.check_options()

    # The list-based collection just appends with no duplicate check, so it
    # is -=ONLY=- suitable for "index and start_from_scratch" mode; every
    # other mode gets the set-based collection.
    if options.index and options.start_from_scratch:
        filename_collection_class = filename_list_collection_class
    else:
        filename_collection_class = filename_set_collection_class

    database = pyindex_class(options.dbname, filename_collection_class, options.verbose)

    if options.index:
        do_index(
            database,
            options.avoid_ascii_coded_binary,
            options.skip_pronouns,
            options.skip_numbers,
            options.skip_words_with_digits,
            options.start_from_scratch,
            options.delimiter,
            options.verbose,
        )
    elif options.search:
        # keywords are stored lowercased as ISO-8859-1 bytes
        do_search(database, options.dbname, options.keyword.lower().encode("ISO-8859-1"))
    elif options.dump:
        do_dump(database)
    else:
        sys.stderr.write("%s: internal error\n" % sys.argv[0])
        usage(1)


main()