#!/usr/bin/python3 """Give a histogram for a series of values.""" # https://stats.stackexchange.com/questions/235330/iles-terminology-for-the-top-half-a-percent import bisect import collections # import cProfile import decimal import itertools import math import sys import typing def make_used(*var: typing.Any) -> None: """Persuade linters that var is used.""" assert True or var def usage(retval: int) -> None: """Output a usage message.""" if retval in (0, None): write = sys.stdout.write else: write = sys.stderr.write write(f'Usage: {sys.argv[0]}\n') write('\t--bin-function specifies the function used to compute the width of the bins\n') write('\t EG: --bin-function "square root" specifies the numeric bin widths will be computed using the square root rule\n') write('\t--display-width specifies the width of the largest count (for the stars; default: 60.0)\n') write('\t--elide elides the stars (overrides --display-width)\n') write('\t--suppress-zeros says to eliminate zero counts from the report. Only relevant for numeric histograms\n') write('\t--force-string tells this program to treat numbers as strings\n') write('\t--help outputs this usage message\n') write('\t--percent gives percentages\n') write('\t--tabbed says to separate fields with tabs instead of spaces, suitable for piping to expand(1)\n') write('\t--input-file fn says to read values from file fn instead of stdin (mostly for debugging this program)\n') write('\n') write('If all values read from stdin can be converted to decimal, bin widths will be the decile of the range - or\n') write('other bin width function, as specified. Otherwise, each distinct str read from stdin will be treated as a bin.\n') write('\n') write(f"IOW, if you feed {sys.argv[0]} lots of numbers, and get one bin for each distinct number, then you\'ve probably\n") write('fed it one or more values that cannot be converted to decimal.Decimal().\n') write('\n') write('Decile choice is used for numeric binning by default.\n') write('\n') sys.stderr.write('Valid choices for bin width functions are:\n') for key in bin_fn_dict: sys.stderr.write(f'\t{key}\n') write('\n') write('Please note that the numeric binning algorithm is no longer O(n) in the number of bins. It is now a more\n') write('reasonable O(logn).\n') sys.exit(retval) def square_root_choice(entire_width: decimal.Decimal) -> int: """Compute the square root choice.""" return math.ceil(math.sqrt(entire_width)) def sturges_formula(entire_width: decimal.Decimal) -> int: """Compute Sturges' formula choice.""" return math.ceil(math.log(entire_width, 2)) + 1 def rice_rule(entire_width: decimal.Decimal) -> int: """Compute rice rule choice.""" one_third = decimal.Decimal(1) / decimal.Decimal(3) two = decimal.Decimal(2) return math.ceil(two * entire_width ** one_third) def decile_rule(entire_width: decimal.Decimal) -> int: """Compute decile rule choice.""" one_tenth = decimal.Decimal(1) / decimal.Decimal(10) return math.ceil(entire_width * one_tenth) def percentile_rule(entire_width: decimal.Decimal) -> int: """Compute percentile rule choice.""" one_hundredth = decimal.Decimal(1) / decimal.Decimal(100) return math.ceil(entire_width * one_hundredth) def quartile_rule(entire_width: decimal.Decimal) -> int: """Compute quartile rule choice.""" one_4th = decimal.Decimal(1) / decimal.Decimal(4) return math.ceil(entire_width * one_4th) def twenty_5th_rule(entire_width: decimal.Decimal) -> int: """Compute twenty fifth rule choice.""" one_25th = decimal.Decimal(1) / decimal.Decimal(25) return math.ceil(entire_width * one_25th) bin_fn_dict = { 'square root': square_root_choice, "sturges formula": sturges_formula, "rice rule": rice_rule, 'quartile': quartile_rule, 'decile': decile_rule, 'twentyfifth-ile': twenty_5th_rule, 'percentile': percentile_rule, } class Options(object): # pylint: disable=too-few-public-methods # too-few-public-methods: We're a container """Hold command line options.""" def __init__(self) -> None: """Initialize.""" self.percent = False self.elide = False self.display_width = decimal.Decimal('60.0') self.tabbed = False self.force_string = False self.separator = ' ' self.bin_function = 'decile' self.suppress_zeros = False self.input_filename = None while sys.argv[1:]: if sys.argv[1:] and sys.argv[1] == '--percent': self.percent = True elif sys.argv[1:] and sys.argv[1] == '--tabbed': self.tabbed = True self.separator = '\t' elif sys.argv[1:] and sys.argv[1] == '--elide': self.elide = True elif sys.argv[1:] and sys.argv[1] == '--suppress-zeros': self.suppress_zeros = True elif sys.argv[1:] and sys.argv[1] == '--force-string': self.force_string = True elif sys.argv[2:] and sys.argv[1] == '--display-width': self.display_width = decimal.Decimal(sys.argv[2]) if self.display_width <= 0.0: sys.stderr.write('--display-width argument must be > 0.0\n') sys.exit(1) del sys.argv[1] elif sys.argv[2:] and sys.argv[1] == '--bin-function': self.bin_function = sys.argv[2] if self.bin_function not in bin_fn_dict: sys.stderr.write('unrecognized --bin-function argument\n') usage(1) del sys.argv[1] elif sys.argv[2:] and sys.argv[1] == '--input-file': self.input_filename = sys.argv[2] del sys.argv[1] elif sys.argv[1] in ('-h', '--help'): usage(0) else: sys.stderr.write(f'{sys.argv[0]}: Option {sys.argv[1]} is unrecognized\n\n') usage(1) del sys.argv[1] def is_numeric(possible_number: str) -> bool: """Return True iff possible_number can be converted to decimal.Decimal.""" try: decimal.Decimal(possible_number) except (ValueError, decimal.InvalidOperation): return False return True def left_and_right_to_str(left_value: decimal.Decimal, right_value: decimal.Decimal, *, equal: bool = False) -> str: """Convert a left, right pair into a meaningful str.""" if equal: operator = '<=' else: operator = '<' # FIXME: We eliminate a lot of our precision here, for concision in the output. It might be nice to make this # configurable someday. But not today. return f'{left_value} <= x {operator} {right_value}' def get_key(bins: typing.List[decimal.Decimal], number: decimal.Decimal): """Derive a key from a line of input.""" # FIXME: This is slow (it is O(n^2) in the number of bins), but should be more accurate in the face of rounding errors than # binary search. if len(bins) == 1: left_value = bins[0] right_value = left_value equal = True elif len(bins) >= 2 and number < bins[-2]: insertion_point = bisect.bisect_left(bins, number) left_index = insertion_point if bins[left_index] > number: left_index -= 1 right_index = left_index + 1 left_value = bins[left_index] right_value = bins[right_index] assert left_value <= number < right_value, ( 'not left_value <= number < right_value' f'\nbins: {bins}\n' f'left_index: {left_index}\nright_index: {right_index}\n' f'left_value: {left_value}\nnumber: {number}\nright_value: {right_value}' ) equal = False else: left_index = len(bins) - 2 right_index = left_index + 1 left_value = bins[left_index] right_value = bins[right_index] equal = True assert left_value <= number <= right_value return left_and_right_to_str(left_value, right_value, equal=equal) def get_dict_numeric(lines: typing.List[str], bin_fn_str: str): """Get a dictionary of decimal values.""" dict_: typing.Dict[str, int] = collections.defaultdict(int) numbers = [decimal.Decimal(line) for line in lines] min_value = min(numbers) max_value = max(numbers) entire_width = max_value - min_value bin_fn = bin_fn_dict[bin_fn_str] bin_width = bin_fn(entire_width) bins = [] value = min_value while value < max_value: bins.append(value) value += bin_width bins.append(max_value) bin_descriptions = [] # Do all but the last value for left_value, right_value in itertools.zip_longest(bins[:-2], bins[1:-1]): bin_descriptions.append(left_and_right_to_str(left_value, right_value)) # Now add just the last value if len(bins) == 1: left_value = bins[0] right_value = left_value else: left_value = bins[-2] right_value = bins[-1] bin_descriptions.append(left_and_right_to_str(left_value, right_value, equal=True)) for number in numbers: key = get_key(bins, number) dict_[key] += 1 return (bin_descriptions, dict_) def get_dict_str(lines: typing.List[str]): """Get a dictionary of decimal values.""" dict_: typing.Dict[str, int] = collections.defaultdict(int) bin_descriptions = sorted(set(lines)) for line in lines: dict_[line] += 1 return (bin_descriptions, dict_) def get_dict( input_filename: typing.Optional[str], force_string: bool, bin_fn_str: str, ) -> typing.Tuple[typing.List[str], typing.Dict[str, int]]: """Read dict from stdin, bin'ing as we go.""" # It's a little confusing: we use bin_fn_str to get the actual function from bin_fn_dict - but for numeric binning, not # string binning. if input_filename is None: lines = [x.rstrip('\n') for x in sys.stdin] else: with open(input_filename, 'r') as file_: lines = [x.rstrip('\n') for x in file_] if len(lines) == 0: return ([], {}) if all(is_numeric(value) for value in lines) and not force_string: (bin_descriptions, dict_) = get_dict_numeric(lines, bin_fn_str) else: (bin_descriptions, dict_) = get_dict_str(lines) return (bin_descriptions, dict_) def output_percent(value: int, total: int, separator: str) -> None: """Output a percentage.""" quotient = decimal.Decimal(value) / total * decimal.Decimal('1000.0') over_ten = round(quotient) / decimal.Decimal('10') sys.stdout.write(f'{over_ten:5.1f}%{separator}') def output_key_and_value(key: str, value: int, max_key_len: int, max_value_len: int, options: Options): """Output the key and value, suitably padded.""" left = str(value) right = key if not options.tabbed: left = left.rjust(max_value_len) right = right.ljust(max_key_len) sys.stdout.write(f'{left}{options.separator}{right}') def output_stars(dict_, key, scale, separator) -> None: """Output the stars, to give a sense of scale.""" sys.stdout.write(separator) for repetition in range(0, int(round(dict_[key] * scale))): make_used(repetition) sys.stdout.write('*') def main() -> None: """Give a histogram for a series of values - main function.""" # This is a nod to arithmetic errors. We hope to make them less likely by using highly precise arithmetic decimal.getcontext().prec = 100 options = Options() (bin_descriptions, dict_) = get_dict(options.input_filename, options.force_string, options.bin_function) if not dict_: sys.stderr.write(f'{sys.argv[0]}: empty stdin\n') sys.exit(0) if options.percent: total = 0 for key in dict_: total = total + dict_[key] max_count = max(dict_.values()) scale = options.display_width / max_count max_key_len = max(len(key) for key in dict_) max_value_len = max(len(str(value)) for value in dict_.values()) for key in bin_descriptions: value = dict_[key] if options.suppress_zeros and value == 0: continue if options.percent: output_percent(value, total, options.separator) output_key_and_value(key, value, max_key_len, max_value_len, options) if not options.elide: output_stars(dict_, key, scale, options.separator) sys.stdout.write('\n') # cProfile.run('main()') main()