#!/usr/bin/python3

"""Index the contents of the files named on stdin."""

import re
import sys
import time

sys.path.append("/usr/local/lib")

import base255
import cachedb
import readline0

DBNUMBER = 1
TIME0 = -1

if DBNUMBER == 1:
    import dbm.gnu

    DBTYPE = dbm.gnu
elif DBNUMBER == 2:
    # this one appeared to be the fastest by far, at least at around 4000 messages.
    raise ValueError("Python 3.x no longer builds dbm.bsd out of the box")
elif DBNUMBER == 3:
    raise ValueError("DBNUMBER 3 no longer supported")
    # import dbm.bsd
    # DBTYPE = dbm.bsd
elif DBNUMBER == 4:
    raise ValueError("DBNUMBER 4 no longer supported")
    # import dbm
    # DBTYPE = dbm
elif DBNUMBER == 5:
    raise ValueError("DBNUMBER 5 no longer supported")
    # import anydbm
    # DBTYPE = anydbm
elif DBNUMBER == 6:
    raise ValueError("DBNUMBER 6 no longer supported")
    # import dumbdbm
    # DBTYPE = dumbdbm
else:
    sys.stderr.write("Sorry, illegal database number selected\n")
    sys.exit(1)


def make_used(*variables):
    """Convince the linter that 'variables' are used."""
    assert True or variables


def usage(retval):
    """Output a usage message."""
    sys.stderr.write("Usage: %s -d databasename [-u] [-c] [-s keyword] [-D] [-v]\n" % sys.argv[0])
    sys.stderr.write("-u says to update the database with all files on stdin\n")
    sys.stderr.write("-c says to start a new database with all files on stdin\n")
    sys.stderr.write("-s keyword says to list all files containing keyword\n")
    sys.stderr.write("-D says to dump the database contents to stdout (mostly useful for debugging)\n")
    sys.stderr.write("-v says to operate verbosely (can be repeated)\n")
    sys.stderr.write("-A says to use a heuristic to skip lines that look like base64/uuencode/binhex or similar\n")
    sys.stderr.write("-p says to skip indexing pronouns\n")
    sys.stderr.write("-n says to skip indexing numbers (words that are all decimal digits)\n")
    sys.stderr.write("-N says to skip indexing words containing any digits\n")
    sys.stderr.write("-0 says the filename list on stdin will be null terminated, not newline terminated\n")
    sys.stderr.write("-h prints this usage message\n")
    sys.stderr.write("\n")
    sys.stderr.write("In any case, -d is required\n")
    sys.stderr.write("You must specify exactly one of -c, -u, -s or -D\n")
    sys.exit(retval)
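

# Example invocations (illustrative only: "pyindex.py" stands in for whatever this
# script is called on your system; the flags are the ones documented in usage()):
#
#     find ~/Mail -type f -print0 | python3 pyindex.py -d mail-index -c -0 -A -p
#     python3 pyindex.py -d mail-index -s hello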


class pyindex_class:
    """Index a collection of files."""

    def __init__(self, dbname, local_filename_collection_class, verbose):
        self.dbname = dbname
        self.local_filename_collection_class = local_filename_collection_class
        # self.db_filename_to_abbreviation = None
        # self.highest_filename_abbreviation = None
        # self.highest_word_abbreviation = None
        # self.db_word_to_abbreviation = None
        # self.db_abbrv_word_to_abbrv_fns = None
        # self.db_abbreviation_to_filename = None
        # self.db_abbreviation_to_word = None
        self.mode = None
        self.verbose = verbose

    def creating_from_scratch(self):
        """Return True iff we're creating a database from scratch."""
        return self.mode == "n"

    def open(self, mode):
        """Open the databases."""
        # 0 (or 1) is good for normal use; 2 is good for testing cache behaviors, because it won't take as long
        # to exercise cache features
        cache_size_set = 0
        if cache_size_set == 0:
            expensive_item_cache_sizes = 5000000
            inexpensive_item_cache_sizes = 800000
        elif cache_size_set == 1:
            expensive_item_cache_sizes = 500000
            inexpensive_item_cache_sizes = 80000
        elif cache_size_set == 2:
            expensive_item_cache_sizes = 5000
            inexpensive_item_cache_sizes = 800
        else:
            raise SystemExit("{}: internal error: cache_size_set has an invalid value".format(sys.argv[0]))

        # We key off of this value to see if we're recreating from scratch (see creating_from_scratch).
        # It should generally be one of:
        # 1) r - read only
        # 2) w - read/write, and don't create if nonexistent
        # 3) c - read/write, and do create if nonexistent
        # 4) n - read/write, and always make an empty database, even if one already existed
        self.mode = mode

        # key: base255 abbrv word -> value: list of base255 abbrv filenames -> on disk: a single string that
        # comprises a list of \0-separated base255 abbrv filenames.  In this we're doing much more than just I/O
        # caching: we also keep, in the cache, a python list (or set) of the base255 numbers, so we don't have to
        # constantly convert the string to a list and back, which was really killing the running time of the
        # program.  If an entry needs to come back into the cache from disk, it is automatically converted from
        # the single-string representation back to the python list.
        self.db_abbrv_word_to_abbrv_fns = cachedb.Database_cache(
            self.dbname + ".abbrv-word-to-abbrv-filelist",
            DBTYPE,
            mode,
            to_string=lambda x: x.to_binary(),
            from_string=self.local_filename_collection_class,
            max_elements_in_memory=expensive_item_cache_sizes,
            write_through=False,
            too_many_percent=88.0,
            verbose=True,
        )

        # key: base255 abbrv filename -> value: filename -> on disk: filename
        # In this we're only doing I/O caching
        self.db_abbreviation_to_filename = cachedb.Database_cache(
            self.dbname + ".abbreviation-to-filename",
            DBTYPE,
            mode,
            max_elements_in_memory=inexpensive_item_cache_sizes,
            write_through=False,
            verbose=True,
        )

        # key: filename -> value: base255 filename abbreviation -> on disk: base255 filename abbreviation
        # In this we're only doing I/O caching
        self.db_filename_to_abbreviation = cachedb.Database_cache(
            self.dbname + ".filename-to-abbreviation",
            DBTYPE,
            mode,
            max_elements_in_memory=inexpensive_item_cache_sizes,
            write_through=False,
            verbose=True,
        )

        # key: base255 abbrv word -> value: word -> on disk: word
        # In this we're only doing I/O caching
        self.db_abbreviation_to_word = cachedb.Database_cache(
            self.dbname + ".abbreviation-to-word",
            DBTYPE,
            mode,
            max_elements_in_memory=inexpensive_item_cache_sizes,
            write_through=False,
            verbose=True,
        )

        # key: word -> value: word abbreviation as base255 -> on disk: word abbreviation as base255
        # In this we're only doing I/O caching
        self.db_word_to_abbreviation = cachedb.Database_cache(
            self.dbname + ".word-to-abbreviation",
            DBTYPE,
            mode,
            max_elements_in_memory=inexpensive_item_cache_sizes,
            write_through=False,
            verbose=True,
        )

        # Find the highest word and filename abbreviations.  Assume that's a good place to start counting from,
        # rather than trying to "fill in holes" :).  This means faster performance for databases that are removed
        # and recreated from time to time, but slower performance for databases that are created and modified and
        # modified and modified...
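        #
        # A minimal sketch of the base255 numbering the scan below relies on; treating the two conversion
        # functions as inverses is an assumption implicit in how this file uses them (base255's internals are
        # not shown here):
        #     key = base255.number_to_base255(4711)      # compact bytes key for the dbm file
        #     assert base255.base255_to_number(key) == 4711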
        self.highest_word_abbreviation = 0
        self.highest_filename_abbreviation = 0
        for abbrv_word in self.db_abbreviation_to_word.keys():
            number = base255.base255_to_number(abbrv_word)
            if number > self.highest_word_abbreviation:
                self.highest_word_abbreviation = number
        for abbrv_filename in self.db_abbreviation_to_filename.keys():
            number = base255.base255_to_number(abbrv_filename)
            if number > self.highest_filename_abbreviation:
                self.highest_filename_abbreviation = number

    def abbrv_fn_to_fn(self, abbrv_filename):
        """Translate an abbreviated filename to a filename."""
        return self.db_abbreviation_to_filename.get(abbrv_filename, b"")

    def fn_to_abbrv_fn(self, filename):
        """Translate a filename to an abbreviated filename."""
        return self.db_filename_to_abbreviation.get(filename, b"")

    def abbrv_word_to_word(self, abbrv_word):
        """Translate an abbreviated word to a word."""
        # sys.stderr.write('In abbrv_word_to_word, abbrv word is %s\n' % str(abbrv_word))
        # sys.stderr.write('In abbrv_word_to_word, keys are %s\n' % str(self.db_abbreviation_to_word.keys()))
        return self.db_abbreviation_to_word.get(abbrv_word, b"")

    def word_to_abbrv_word(self, word):
        """Translate a word to an abbreviated word."""
        return self.db_word_to_abbreviation.get(word, b"")

    def __contains__(self, word):
        return word in self.db_word_to_abbreviation

    def keys(self):
        """Yield the words in the index."""
        abbrv_words = self.db_abbrv_word_to_abbrv_fns.keys()
        for abbrv_word in abbrv_words:
            if abbrv_word == b"":
                sys.stderr.write(
                    "%s: in keys(), internal error: empty abbreviated word in the word-to-filenames database\n"
                    % sys.argv[0]
                )
                sys.exit(1)
            yield self.db_abbreviation_to_word[abbrv_word]

    def __getitem__(self, word):
        abbreviated_word = self.word_to_abbrv_word(word)
        try:
            abbreviated_filenames = self.db_abbrv_word_to_abbrv_fns[abbreviated_word].files()
        except KeyError:
            abbreviated_filenames = []
        unabbreviated_filenames = [
            self.abbrv_fn_to_fn(abbreviated_filename) for abbreviated_filename in abbreviated_filenames
        ]
        result = [string.decode("ISO-8859-1") for string in unabbreviated_filenames]
        return result

    def add_filename_to_words(self, filename, words):
        """Add a filename to a list of words."""
        if self.verbose >= 3:
            sys.stderr.write("Adding words %s\n" % b" ".join(words).decode("ISO-8859-1"))
        abbrv_filename = self.fn_to_abbrv_fn(filename)
        if abbrv_filename == b"":
            # filename abbreviation does not yet exist - set up the filename abbreviation
            self.highest_filename_abbreviation += 1
            abbrv_filename = base255.number_to_base255(self.highest_filename_abbreviation)
            self.db_abbreviation_to_filename[abbrv_filename] = filename
            self.db_filename_to_abbreviation[filename] = abbrv_filename
        for word in words:
            abbrv_word = self.word_to_abbrv_word(word)
            if abbrv_word == b"":
                # word abbreviation does not yet exist - set up the word abbreviation
                self.highest_word_abbreviation += 1
                abbrv_word = base255.number_to_base255(self.highest_word_abbreviation)
                self.db_abbreviation_to_word[abbrv_word] = word
                self.db_word_to_abbreviation[word] = abbrv_word
            if abbrv_word in self.db_abbrv_word_to_abbrv_fns:
                if self.verbose >= 4:
                    sys.stderr.write("Adding filename %s to word %s\n" % (filename, word))
                self.db_abbrv_word_to_abbrv_fns[abbrv_word].add(abbrv_filename)
            else:
                # An abbreviated filename list for this abbreviated word does not yet exist - create one
                if self.verbose >= 4:
                    sys.stderr.write("Adding filename %s to word %s by starting a new word\n" % (filename, word))
                self.db_abbrv_word_to_abbrv_fns[abbrv_word] = self.local_filename_collection_class(
                    self.db_filename_to_abbreviation[filename],
                )
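
    # An illustrative walk through the mappings above (bytes in, since words and filenames arrive as bytes from
    # stdin and WORD_REGEX; __getitem__ decodes its results to ISO-8859-1 strings on the way out).  Once the
    # databases have been open()ed:
    #     database.add_filename_to_words(b"/tmp/a.txt", [b"hello", b"world"])
    #     database[b"hello"]  ->  ["/tmp/a.txt"]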

    def close(self):
        """Close our databases."""
        self.db_abbrv_word_to_abbrv_fns.close()
        self.db_abbreviation_to_filename.close()
        self.db_filename_to_abbreviation.close()
        self.db_abbreviation_to_word.close()
        self.db_word_to_abbreviation.close()


def good_length(word):
    """Return a truthy value iff a word is of a "good" length (3 to 20 characters)."""
    return word[2:] and not word[20:]


WORD_REGEX = re.compile(rb"\w+")
WHITESPACE_REGEX = re.compile(rb"\s")
NUMBER_REGEX = re.compile(rb"\d")
ALL_DIGITS_REGEX = re.compile(rb"\d+")

PRONOUN_LIST = [
    "i",
    "all",
    "another",
    "any",
    "anybody",
    "anyone",
    "anything",
    "both",
    "each",
    "either",
    "everybody",
    "everyone",
    "everything",
    "few",
    "he",
    "her",
    "hers",
    "herself",
    "him",
    "himself",
    "his",
    "it",
    "its",
    "itself",
    "many",
    "me",
    "mine",
    "my",
    "myself",
    "neither",
    "no",
    "nobody",
    "none",
    "nothing",
    "one",
    "others",
    "our",
    "ours",
    "ourselves",
    "several",
    "she",
    "some",
    "somebody",
    "someone",
    "something",
    "that",
    "their",
    "theirs",
    "them",
    "themselves",
    "these",
    "they",
    "this",
    "us",
    "we",
    "what",
    "which",
    "who",
    "whom",
    "whose",
    "you",
    "your",
    "yours",
    "yourself",
    "yourselves",
]

# if a pronoun doesn't have a "good length", it'll never be a match anyway, so eliminate the bad-length pronouns
# from the collection we search
PRONOUNS = set(pronoun.encode("ISO-8859-1") for pronoun in PRONOUN_LIST if good_length(pronoun))


# this version uses a hash table (a python set) to speed up determining whether a word is a pronoun.  It's
# probably faster than a binary search on a sorted array (python "list"), but it's worthwhile to keep the
# possibility of a binary search in reserve if the set proves too time consuming
def is_not_a_pronoun(word):
    """Return True iff word is not a pronoun."""
    return word not in PRONOUNS


def does_not_contain_a_digit(word):
    """Return True iff word does not contain a digit."""
    return NUMBER_REGEX.search(word) is None


def is_not_a_number(word):
    """Return True iff word is not a number - that is, not composed entirely of decimal digits."""
    return ALL_DIGITS_REGEX.fullmatch(word) is None
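

# A few concrete checks of the helpers above, wrapped in a function that nothing calls (an illustrative sketch,
# not a test suite; run it by hand if you want to verify):
def _word_filter_examples():
    """Illustrate good_length and the three predicates, on bytes as WORD_REGEX produces them."""
    assert good_length(b"cat")                        # 3..20 characters: truthy
    assert not good_length(b"at")                     # too short: falsy
    assert not is_not_a_pronoun(b"herself")           # a pronoun, so it gets filtered
    assert not does_not_contain_a_digit(b"sha256")    # contains a digit
    assert not is_not_a_number(b"2024")               # composed entirely of decimal digits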


def split_file_into_words(file_, avoid_ascii_coded_binary, skip_pronouns, skip_numbers, skip_words_with_digits):
    """Split a file into a set of unique words."""
    # Note: for the bytes patterns used here, Python 3's re module matches ASCII only: \d is equivalent to
    # [0-9], and \w is equivalent to [a-zA-Z0-9_] (unless re.LOCALE is given, which it is not)
    unique_words = set()
    for line in file_:
        line = line.strip()
        if avoid_ascii_coded_binary and line[20:] and WHITESPACE_REGEX.search(line) is None:
            # Skip this line - it's over 20 characters long, and it contains no whitespace (aside from at its
            # beginning and/or end), so it's very, very likely to be just a bunch of base64 (or other
            # binary-to-text mapping like binhex or uuencode) gibberish that we don't need (indeed, that we
            # don't -want-) to index, because many people's mail has a ton of this binary-to-text nonsense, and
            # it isn't meaningful to search through it in this form.  If we were to convert the gibberish to
            # some sort of meaningful representation - for example, by converting an openoffice document to
            # text - -then- it would be reasonable to search through -that-.  But we don't.
            #
            # someday, if this heuristic proves too hefty, then we might switch to slurping in a dictionary of
            # acceptable words from /usr/dict/words or whatever, and only storing words that appear in that
            # dictionary
            continue
        words_of_good_length = [word for word in WORD_REGEX.findall(line) if good_length(word)]
        temp_list = [word.lower() for word in words_of_good_length]
        if skip_pronouns:
            temp_list = [word for word in temp_list if is_not_a_pronoun(word)]
        # note that if a word has a digit and is skipped as a result, there's no sense in also checking whether
        # it's composed entirely of digits
        if skip_words_with_digits:
            temp_list = [element for element in temp_list if does_not_contain_a_digit(element)]
        elif skip_numbers:
            temp_list = [element for element in temp_list if is_not_a_number(element)]
        unique_words.update(temp_list)
    return unique_words


def human_readable_seconds(seconds):
    """Convert a number of seconds to a human-readable string."""
    # each tuple is (units per next-larger unit, label for the remainder); months are approximated as 30 days
    list_ = [(60, "s"), (60, "m"), (24, "h"), (30, "d"), (12, "mo")]
    temp = int(seconds)
    result = []
    for divisor, unit in list_:
        if temp <= 0:
            break
        quotient, remainder = divmod(temp, divisor)
        temp = quotient
        result.append(str(remainder) + unit)
    if temp > 0:
        # whatever is left over after the last divisor is years
        result.append(str(temp) + "y")
    result.reverse()
    if not result:
        return " 0s"
    return " " + " ".join(result)
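

# Worked examples for human_readable_seconds, wrapped in a function that nothing calls (the values were traced
# by hand through the loop above):
def _human_readable_seconds_examples():
    """Spot-check human_readable_seconds."""
    assert human_readable_seconds(0) == " 0s"
    assert human_readable_seconds(59) == " 59s"
    assert human_readable_seconds(3661) == " 1h 1m 1s"    # one hour, one minute, one second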


def null_progress(fileno, numfiles):
    """Display progress for verbose == 0 - IOW, don't output anything."""
    make_used(fileno)
    make_used(numfiles)


def verbose_progress(fileno, numfiles):
    """Display progress for verbose == 1."""
    if fileno != 0 and fileno % 1000 == 0:
        time1 = time.time()
        diff = time1 - TIME0
        filespersecond = fileno / diff
        if numfiles == 0:
            sys.stderr.write(
                "File %d, %f files/second,%s elapsed\n"
                % (
                    fileno,
                    filespersecond,
                    human_readable_seconds(diff),
                )
            )
        else:
            remainingtime = (numfiles - fileno) / filespersecond
            sys.stderr.write(
                "File %d of %d, %f%% done, %f files/second,%s elapsed,%s estimated remaining\n"
                % (
                    fileno,
                    numfiles,
                    fileno * 100.0 / numfiles,
                    filespersecond,
                    human_readable_seconds(diff),
                    human_readable_seconds(remainingtime),
                )
            )


def very_verbose_progress(fileno, numfiles):
    """Display progress for verbose >= 2."""
    time1 = time.time()
    diff = time1 - TIME0
    if fileno != 0:
        filespersecond = fileno / diff
        if numfiles == 0:
            sys.stderr.write(
                "File %d, %f files/second,%s elapsed\n"
                % (
                    fileno,
                    filespersecond,
                    human_readable_seconds(diff),
                )
            )
        else:
            remainingtime = (numfiles - fileno) / filespersecond
            sys.stderr.write(
                "File %d of %d, %f%% done, %f files/second,%s elapsed,%s estimated remaining\n"
                % (
                    fileno,
                    numfiles,
                    fileno * 100.0 / numfiles,
                    filespersecond,
                    human_readable_seconds(diff),
                    human_readable_seconds(remainingtime),
                )
            )


def get_filenames(verbose, start_from_scratch, delimiter):
    """Read the list of filenames from stdin, deduplicating it if we're starting from scratch."""
    filenames = []
    filename_no = 0
    if start_from_scratch:
        uniqueness_set = set()
        for filename in readline0.readline0(sys.stdin.fileno(), delimiter):
            uniqueness_set.add(filename)
            filename_no += 1
            if verbose and filename_no % 10000 == 0:
                sys.stderr.write("\rRead %d lines\n" % filename_no)
        filenames = list(uniqueness_set)
    else:
        for filename in readline0.readline0(sys.stdin.fileno(), delimiter):
            filenames.append(filename)
            filename_no += 1
            if verbose and filename_no % 10000 == 0:
                sys.stderr.write("\rRead %d lines\n" % filename_no)
    return filenames


def do_index(
    database,
    avoid_ascii_coded_binary,
    skip_pronouns,
    skip_numbers,
    skip_words_with_digits,
    start_from_scratch,
    delimiter,
    verbose,
):
    """Index a collection of files."""
    global TIME0

    if verbose == 0:
        progress = null_progress
    elif verbose == 1:
        progress = verbose_progress
    elif verbose >= 2:
        progress = very_verbose_progress
    else:
        raise Exception("No progress function for verbose {}".format(verbose))

    if start_from_scratch:
        # erase what's there, and open for read/write
        database.open("n")
    else:
        # open what's there read/write, creating if necessary
        database.open("c")

    if verbose:
        sys.stderr.write("Inhaling filenames\n")
    filenames = get_filenames(verbose, start_from_scratch, delimiter)
    num_filenames = len(filenames)
    if verbose:
        sys.stderr.write("Done inhaling %d filenames\n" % num_filenames)
        if start_from_scratch:
            sys.stderr.write("Starting to create an all-new index\n")
        else:
            sys.stderr.write("Starting to update the potentially-preexisting index\n")

    TIME0 = time.time()
    unopenable = 0
    for filenameno, filename in enumerate(filenames):
        progress(filenameno, num_filenames)
        if verbose >= 3:
            sys.stderr.write("Indexing file %s\n" % filename)
        try:
            file_ = open(filename, "rb")
        except OSError:
            sys.stderr.write("Could not open %s\n" % filename)
            unopenable += 1
            continue
        # Note that split_file_into_words deduplicates the words of each file, but nothing deduplicates words
        # across files, which may end up making the cache work quite a bit harder in some cases.  Two
        # pathological cases are:
        # 1) a file full of occurrences of a single word.
        # 2) a file full of a repeating sequence of word1 ... word20000
        database.add_filename_to_words(
            filename,
            split_file_into_words(file_, avoid_ascii_coded_binary, skip_pronouns, skip_numbers, skip_words_with_digits),
        )
        file_.close()
    database.close()
    if verbose:
        sys.stderr.write("%d of %d files were unopenable\n" % (unopenable, num_filenames))


def do_search(database, dbname, keyword):
    """Search a database for a keyword."""
    database.open("r")
    if keyword in database:
        filenames = database[keyword]
        if not filenames:
            sys.stderr.write("Sorry, %s is not in database %s\n" % (keyword.decode("ISO-8859-1"), dbname))
            sys.exit(1)
        for filename in filenames:
            print(filename)
    else:
        sys.stderr.write("Sorry, %s is not in the database\n" % keyword.decode("ISO-8859-1"))
        sys.exit(1)


def do_dump(database):
    """Dump a database to stdout."""
    database.open("r")
    for keyno, key in enumerate(database.keys()):
        sys.stdout.write("%d %s" % (keyno, key.decode("ISO-8859-1")))
        values = database[key]
        sys.stdout.write(" %d" % len(values))
        for value in values:
            sys.stdout.write(" %s" % value)
        sys.stdout.write("\n")


class filename_set_collection_class:
    """
    A filename collection class based on sets.

    This does the full-on "read string from disk, convert string to set, make your changes, convert set to
    string, write string to disk" thing - aside from the caching, of course.
    """

    def __init__(self, base255_strings):
        self.set_ = set()
        for base255_string in base255_strings.split(b"\0"):
            self.set_.add(base255_string)

    def files(self):
        """Return the files."""
        return list(self.set_)

    def to_binary(self):
        """Convert to a bytes representation."""
        return b"\0".join(self.files())

    def add(self, abbrv_filename):
        """Add an abbreviated filename."""
        self.set_.add(abbrv_filename)


class filename_list_collection_class:
    """
    A filename collection class based on lists.

    This variant just appends on add(), so it's only suitable when each (word, filename) pair is seen at most
    once - that is, when indexing from scratch.
    """

    def __init__(self, base255_strings):
        self.list_ = base255_strings.split(b"\0")

    def files(self):
        """Return the list of files."""
        return self.list_

    def to_binary(self):
        """Convert to a bytes representation."""
        return b"\0".join(self.list_)

    def add(self, abbrv_filename):
        """Add an abbreviated filename."""
        self.list_.append(abbrv_filename)
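

# An illustrative round-trip through the \0-joined on-disk format the collection classes share, wrapped in a
# function that nothing calls (a sketch, not a test suite):
def _collection_roundtrip_example():
    """Show that to_binary() inverts __init__ for the set-based collection."""
    collection = filename_set_collection_class(b"\x01\x00\x02")    # two abbreviated filenames
    collection.add(b"\x03")
    assert set(collection.to_binary().split(b"\0")) == {b"\x01", b"\x02", b"\x03"}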


class Options:
    """Parse and hold the command line options."""

    def __init__(self):
        self.index = False
        self.search = False
        self.dump = False
        self.dbname = ""
        self.keyword = ""
        self.verbose = 0
        self.avoid_ascii_coded_binary = False
        self.skip_pronouns = False
        self.skip_numbers = False
        self.skip_words_with_digits = False
        self.start_from_scratch = False
        self.delimiter = b"\n"
        while sys.argv[1:]:
            if sys.argv[1] == "-d" and sys.argv[2:]:
                self.dbname = sys.argv[2]
                del sys.argv[1]
            elif sys.argv[1] == "-s" and sys.argv[2:]:
                self.search = True
                self.keyword = sys.argv[2]
                del sys.argv[1]
            elif sys.argv[1] == "-u":
                self.index = True
                self.start_from_scratch = False
            elif sys.argv[1] == "-c":
                self.index = True
                self.start_from_scratch = True
            elif sys.argv[1] == "-D":
                self.dump = True
            elif sys.argv[1] == "-v":
                self.verbose += 1
            elif sys.argv[1] == "-A":
                self.avoid_ascii_coded_binary = True
            elif sys.argv[1] == "-p":
                self.skip_pronouns = True
            elif sys.argv[1] == "-n":
                self.skip_numbers = True
            elif sys.argv[1] == "-N":
                self.skip_words_with_digits = True
            elif sys.argv[1] == "-0":
                self.delimiter = b"\0"
            elif sys.argv[1] == "-h":
                usage(0)
            else:
                usage(1)
            del sys.argv[1]

    def check_options(self):
        """Make sure a usable combination of options was given."""
        if self.dbname == "":
            sys.stderr.write("You must specify a database name with -d\n")
            usage(1)
        if self.index + self.search + self.dump != 1:
            sys.stderr.write(
                "You must give exactly one of -s, -c, -u or -D (internal use: %d)\n"
                % (self.index + self.search + self.dump)
            )
            usage(1)


def main():
    """
    Main function.

    Parse options, validate option consistency, and invoke top-level behaviors.
    """
    options = Options()
    options.check_options()

    if options.index and options.start_from_scratch:
        # This collection just appends on add()!  It's -=ONLY=- suitable for "index and start_from_scratch"
        # mode, where each (word, filename) pair is seen at most once
        filename_collection_class = filename_list_collection_class
    else:
        filename_collection_class = filename_set_collection_class

    database = pyindex_class(options.dbname, filename_collection_class, options.verbose)

    if options.index:
        do_index(
            database,
            options.avoid_ascii_coded_binary,
            options.skip_pronouns,
            options.skip_numbers,
            options.skip_words_with_digits,
            options.start_from_scratch,
            options.delimiter,
            options.verbose,
        )
    elif options.search:
        lower_bytes = options.keyword.lower().encode("ISO-8859-1")
        do_search(database, options.dbname, lower_bytes)
    elif options.dump:
        do_dump(database)
    else:
        sys.stderr.write("%s: internal error\n" % sys.argv[0])
        usage(1)


main()
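
# Worked trace of the argv shifter in Options.__init__ (illustrative only):
#     argv = [prog, "-d", "mail-index", "-c", "-v", "-v"]
#     For "-d", the inner del plus the loop's trailing del consume both the flag and its argument;
#     "-c" sets index and start_from_scratch; each "-v" bumps verbose by one.
#     Net effect: dbname="mail-index", index=True, start_from_scratch=True, verbose=2.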