#!/usr/bin/python3

"""Give a histogram for a series of values."""

# https://stats.stackexchange.com/questions/235330/iles-terminology-for-the-top-half-a-percent

import bisect
import collections
# import cProfile
import decimal
import itertools
import math
import sys
import typing


def make_used(*var: typing.Any) -> None:
    """Persuade linters that var is used."""
    assert True or var


def usage(retval: int) -> None:
    """Output a usage message."""
    if retval in (0, None):
        write = sys.stdout.write
    else:
        write = sys.stderr.write

    write(f'Usage: {sys.argv[0]}\n')
    write('\t--bin-function specifies the function used to compute the width of the bins\n')
    write('\t   EG: --bin-function "square root" specifies the numeric bin widths will be computed using the square root rule\n')
    write('\t--display-width specifies the width of the largest count (for the stars; default: 60.0)\n')
    write('\t--elide elides the stars (overrides --display-width)\n')
    write('\t--suppress-zeros says to eliminate zero counts from the report.  Only relevant for numeric histograms\n')
    write('\t--force-string tells this program to treat numbers as strings\n')
    write('\t--help outputs this usage message\n')
    write('\t--percent gives percentages\n')
    write('\t--tabbed says to separate fields with tabs instead of spaces, suitable for piping to expand(1)\n')
    write('\t--input-file fn says to read values from file fn instead of stdin (mostly for debugging this program)\n')
    write('\n')
    write('If all values read from stdin can be converted to decimal, bin widths will be the decile of the range - or\n')
    write('other bin width function, as specified.  Otherwise, each distinct str read from stdin will be treated as a bin.\n')
    write('\n')
    write(f"IOW, if you feed {sys.argv[0]} lots of numbers, and get one bin for each distinct number, then you\'ve probably\n")
    write('fed it one or more values that cannot be converted to decimal.Decimal().\n')
    write('\n')
    write('Decile choice is used for numeric binning by default.\n')
    write('\n')
    sys.stderr.write('Valid choices for bin width functions are:\n')
    for key in bin_fn_dict:
        sys.stderr.write(f'\t{key}\n')
    write('\n')
    write('Please note that the numeric binning algorithm is no longer O(n) in the number of bins.  It is now a more\n')
    write('reasonable O(logn).\n')
    sys.exit(retval)


def square_root_choice(entire_width: decimal.Decimal) -> int:
    """Compute the square root choice."""
    return math.ceil(math.sqrt(entire_width))


def sturges_formula(entire_width: decimal.Decimal) -> int:
    """Compute Sturges' formula choice."""
    return math.ceil(math.log(entire_width, 2)) + 1


def rice_rule(entire_width: decimal.Decimal) -> int:
    """Compute rice rule choice."""
    one_third = decimal.Decimal(1) / decimal.Decimal(3)
    two = decimal.Decimal(2)
    return math.ceil(two * entire_width ** one_third)


def decile_rule(entire_width: decimal.Decimal) -> int:
    """Compute decile rule choice."""
    one_tenth = decimal.Decimal(1) / decimal.Decimal(10)
    return math.ceil(entire_width * one_tenth)


def percentile_rule(entire_width: decimal.Decimal) -> int:
    """Compute percentile rule choice."""
    one_hundredth = decimal.Decimal(1) / decimal.Decimal(100)
    return math.ceil(entire_width * one_hundredth)


def quartile_rule(entire_width: decimal.Decimal) -> int:
    """Compute quartile rule choice."""
    one_4th = decimal.Decimal(1) / decimal.Decimal(4)
    return math.ceil(entire_width * one_4th)


def twenty_5th_rule(entire_width: decimal.Decimal) -> int:
    """Compute twenty fifth rule choice."""
    one_25th = decimal.Decimal(1) / decimal.Decimal(25)
    return math.ceil(entire_width * one_25th)


bin_fn_dict = {
    'square root': square_root_choice,
    "sturges formula": sturges_formula,
    "rice rule": rice_rule,
    'quartile': quartile_rule,
    'decile': decile_rule,
    'twentyfifth-ile': twenty_5th_rule,
    'percentile': percentile_rule,
}


class Options(object):
    # pylint: disable=too-few-public-methods
    # too-few-public-methods: We're a container
    """Hold command line options."""

    def __init__(self) -> None:
        """Initialize."""
        self.percent = False
        self.elide = False
        self.display_width = decimal.Decimal('60.0')
        self.tabbed = False
        self.force_string = False
        self.separator = ' '
        self.bin_function = 'decile'
        self.suppress_zeros = False
        self.input_filename = None
        while sys.argv[1:]:
            if sys.argv[1:] and sys.argv[1] == '--percent':
                self.percent = True
            elif sys.argv[1:] and sys.argv[1] == '--tabbed':
                self.tabbed = True
                self.separator = '\t'
            elif sys.argv[1:] and sys.argv[1] == '--elide':
                self.elide = True
            elif sys.argv[1:] and sys.argv[1] == '--suppress-zeros':
                self.suppress_zeros = True
            elif sys.argv[1:] and sys.argv[1] == '--force-string':
                self.force_string = True
            elif sys.argv[2:] and sys.argv[1] == '--display-width':
                self.display_width = decimal.Decimal(sys.argv[2])
                if self.display_width <= 0.0:
                    sys.stderr.write('--display-width argument must be > 0.0\n')
                    sys.exit(1)
                del sys.argv[1]
            elif sys.argv[2:] and sys.argv[1] == '--bin-function':
                self.bin_function = sys.argv[2]
                if self.bin_function not in bin_fn_dict:
                    sys.stderr.write('unrecognized --bin-function argument\n')
                    usage(1)
                del sys.argv[1]
            elif sys.argv[2:] and sys.argv[1] == '--input-file':
                self.input_filename = sys.argv[2]
                del sys.argv[1]
            elif sys.argv[1] in ('-h', '--help'):
                usage(0)
            else:
                sys.stderr.write(f'{sys.argv[0]}: Option {sys.argv[1]} is unrecognized\n\n')
                usage(1)
            del sys.argv[1]


def is_numeric(possible_number: str) -> bool:
    """Return True iff possible_number can be converted to decimal.Decimal."""
    try:
        decimal.Decimal(possible_number)
    except (ValueError, decimal.InvalidOperation):
        return False
    return True


def left_and_right_to_str(left_value: decimal.Decimal, right_value: decimal.Decimal, *, equal: bool = False) -> str:
    """Convert a left, right pair into a meaningful str."""
    if equal:
        operator = '<='
    else:
        operator = '<'

    # FIXME: We eliminate a lot of our precision here, for concision in the output.  It might be nice to make this
    # configurable someday.  But not today.
    return f'{left_value} <= x {operator} {right_value}'


def get_key(bins: typing.List[decimal.Decimal], number: decimal.Decimal):
    """Derive a key from a line of input."""
    # FIXME: This is slow (it is O(n^2) in the number of bins), but should be more accurate in the face of rounding errors than
    # binary search.
    if len(bins) == 1:
        left_value = bins[0]
        right_value = left_value
        equal = True
    elif len(bins) >= 2 and number < bins[-2]:
        insertion_point = bisect.bisect_left(bins, number)
        left_index = insertion_point
        if bins[left_index] > number:
            left_index -= 1
        right_index = left_index + 1
        left_value = bins[left_index]
        right_value = bins[right_index]
        assert left_value <= number < right_value, (
            'not left_value <= number < right_value'
            f'\nbins: {bins}\n'
            f'left_index: {left_index}\nright_index: {right_index}\n'
            f'left_value: {left_value}\nnumber: {number}\nright_value: {right_value}'
        )
        equal = False
    else:
        left_index = len(bins) - 2
        right_index = left_index + 1
        left_value = bins[left_index]
        right_value = bins[right_index]
        equal = True
        assert left_value <= number <= right_value
    return left_and_right_to_str(left_value, right_value, equal=equal)


def get_dict_numeric(lines: typing.List[str], bin_fn_str: str):
    """Get a dictionary of decimal values."""
    dict_: typing.Dict[str, int] = collections.defaultdict(int)
    numbers = [decimal.Decimal(line) for line in lines]
    min_value = min(numbers)
    max_value = max(numbers)
    entire_width = max_value - min_value
    bin_fn = bin_fn_dict[bin_fn_str]
    bin_width = bin_fn(entire_width)
    bins = []
    value = min_value
    while value < max_value:
        bins.append(value)
        value += bin_width
    bins.append(max_value)
    bin_descriptions = []
    # Do all but the last value
    for left_value, right_value in itertools.zip_longest(bins[:-2], bins[1:-1]):
        bin_descriptions.append(left_and_right_to_str(left_value, right_value))
    # Now add just the last value
    if len(bins) == 1:
        left_value = bins[0]
        right_value = left_value
    else:
        left_value = bins[-2]
        right_value = bins[-1]
    bin_descriptions.append(left_and_right_to_str(left_value, right_value, equal=True))
    for number in numbers:
        key = get_key(bins, number)
        dict_[key] += 1
    return (bin_descriptions, dict_)


def get_dict_str(lines: typing.List[str]):
    """Get a dictionary of decimal values."""
    dict_: typing.Dict[str, int] = collections.defaultdict(int)
    bin_descriptions = sorted(set(lines))
    for line in lines:
        dict_[line] += 1
    return (bin_descriptions, dict_)


def get_dict(
        input_filename: typing.Optional[str],
        force_string: bool,
        bin_fn_str: str,
) -> typing.Tuple[typing.List[str], typing.Dict[str, int]]:
    """Read dict from stdin, bin'ing as we go."""
    # It's a little confusing: we use bin_fn_str to get the actual function from bin_fn_dict - but for numeric binning, not
    # string binning.
    if input_filename is None:
        lines = [x.rstrip('\n') for x in sys.stdin]
    else:
        with open(input_filename, 'r') as file_:
            lines = [x.rstrip('\n') for x in file_]

    if len(lines) == 0:
        return ([], {})
    if all(is_numeric(value) for value in lines) and not force_string:
        (bin_descriptions, dict_) = get_dict_numeric(lines, bin_fn_str)
    else:
        (bin_descriptions, dict_) = get_dict_str(lines)
    return (bin_descriptions, dict_)


def output_percent(value: int, total: int, separator: str) -> None:
    """Output a percentage."""
    quotient = decimal.Decimal(value) / total * decimal.Decimal('1000.0')
    over_ten = round(quotient) / decimal.Decimal('10')
    sys.stdout.write(f'{over_ten:5.1f}%{separator}')


def output_key_and_value(key: str, value: int, max_key_len: int, max_value_len: int, options: Options):
    """Output the key and value, suitably padded."""
    left = str(value)
    right = key

    if not options.tabbed:
        left = left.rjust(max_value_len)
        right = right.ljust(max_key_len)

    sys.stdout.write(f'{left}{options.separator}{right}')


def output_stars(dict_, key, scale, separator) -> None:
    """Output the stars, to give a sense of scale."""
    sys.stdout.write(separator)
    for repetition in range(0, int(round(dict_[key] * scale))):
        make_used(repetition)
        sys.stdout.write('*')


def main() -> None:
    """Give a histogram for a series of values - main function."""
    # This is a nod to arithmetic errors.  We hope to make them less likely by using highly precise arithmetic
    decimal.getcontext().prec = 100

    options = Options()
    (bin_descriptions, dict_) = get_dict(options.input_filename, options.force_string, options.bin_function)

    if not dict_:
        sys.stderr.write(f'{sys.argv[0]}: empty stdin\n')
        sys.exit(0)

    if options.percent:
        total = 0
        for key in dict_:
            total = total + dict_[key]

    max_count = max(dict_.values())
    scale = options.display_width / max_count

    max_key_len = max(len(key) for key in dict_)
    max_value_len = max(len(str(value)) for value in dict_.values())

    for key in bin_descriptions:
        value = dict_[key]
        if options.suppress_zeros and value == 0:
            continue
        if options.percent:
            output_percent(value, total, options.separator)

        output_key_and_value(key, value, max_key_len, max_value_len, options)

        if not options.elide:
            output_stars(dict_, key, scale, options.separator)
        sys.stdout.write('\n')


# cProfile.run('main()')
main()