#!/usr/bin/env bash

# Reclaim storage from a backshift repo, by deleting files older than a user-specified threshold

# Strict mode: stop at the first failing command and treat unset variables as errors.
set -e -u
# pipefail is not available in every shell, so a failure to enable it is deliberately ignored.
set -o pipefail 2> /dev/null 1>&2 || :

# Option values; they start out empty and are filled in from the command line.
days=""
save_directory=""
# NOTE: $verbose is deliberately left unquoted wherever it is used.  Quoted, an empty value would be passed to
# commands as an empty-string argument; unquoted, an empty value simply disappears from the command line.
verbose=""

# Write the option summary to stderr, then terminate the script.
#   $1 - exit status to terminate with
function usage
{
	local exit_status="$1"
	# One argument per output line; everything goes to stderr so stdout stays clean.
	printf '%s\n' \
		"Usage: $0 --save-directory /save/directory --days num_days_to_keep -v --help" \
		"Traverse /save/directory looking for storage that can be reclaimed" \
		"" \
		"   --save-directory /r/e/p/o    specify where the repository is" \
		"   --days days                  eliminate saveset summaries and chunks older than days." \
		"                                0 means expire everything." \
		"                                1 means expire things 1 day old or more." \
		"   --verbose                    operate in verbose mode.  Also -v" \
		"   --help                       output this message.  Also -h" \
		1>&2
	exit "$exit_status"
}

# Parse the command line, one option per iteration.  Options that take a value consume two arguments: the
# extra shift happens inside the case arm, the common shift at the bottom of the loop.
while [ "$#" -ge 1 ]
do
	case "$1" in
		--save-directory)
			# Without this check a trailing --save-directory would reference an unset $2, which under
			# set -u kills the script with an unhelpful "unbound variable" message.
			if [ "$#" -lt 2 ]
			then
				echo "$0: $1 requires an argument" 1>&2
				usage 1
			fi
			save_directory="$2"
			shift
			;;
		--days)
			# Same reasoning as for --save-directory above.
			if [ "$#" -lt 2 ]
			then
				echo "$0: $1 requires an argument" 1>&2
				usage 1
			fi
			days="$2"
			shift
			;;
		--verbose|-v)
			# Stored as the flag itself so it can be passed straight through to rm.
			verbose="-v"
			;;
		--help|-h)
			usage 0
			;;
		*)
			echo "$0: Illegal option: $1" 1>&2
			usage 1
			;;
	esac
	shift
done

# Both --save-directory and --days are mandatory.
if [ "$save_directory" = "" ]
then
	echo "$0: --save-directory is a required option" 1>&2
	usage 1
fi

if [ "$days" = "" ]
then
	echo "$0: --days is a required option" 1>&2
	usage 1
fi

# $days is later handed to bc and to shell arithmetic, so insist on plain decimal digits.  Shell arithmetic
# evaluates arbitrary expressions, which makes unchecked input both fragile and unsafe.
case "$days" in
	*[!0-9]*)
		echo "$0: --days must be a non-negative integer, got: $days" 1>&2
		usage 1
		;;
esac
# Normalise to base 10: a leading zero would otherwise be read as octal by the shell (but as decimal by bc),
# and values such as 08 would be an arithmetic error.
days=$((10#$days))

# We don't really _need_ to chdir here - but it's a good way to test for a directory or a symlink pointing at a directory.
# Hence the subshell; that way, we chdir, but the change is immediately lost.
if ! (set -eu; cd "$save_directory" > /dev/null 2>&1)
then
	echo "$0: $save_directory does not exist" 1>&2
	usage 1
fi

# Deal with summaries and files directories
#
# A saveset is expired when its finish time (seconds since the epoch) is earlier than threshold_seconds, which is
# the current time minus $days days.  Expiring a saveset removes its summary file and the directory of the same
# name under files/.
seconds_old=$(echo "$days" \* 24 \* 60 \* 60 | bc)
time_now=$(python -c 'import time; print(int(time.time()))')
threshold_seconds=$(echo "$time_now" - "$seconds_old" | bc)
# IFS= and -r keep read from trimming whitespace or interpreting backslashes in file names.
if ! find "$save_directory"/summaries -maxdepth 1 -name '*_*_*_*_*' -print | \
	while IFS= read -r summaries_filename
	do
		# The loop runs as a pipeline stage; restate the shell options for it.
		set -eu
		set -o pipefail > /dev/null 2>&1 || true

		# Strip everything up to and including the last slash to get the saveset name.
		saveset_name="${summaries_filename##*/}"
		# :? aborts rather than ever letting the rm -rf below target the whole files/ directory.
		files_directory="$save_directory/files/${saveset_name:?}"

		# Second field of the finish_time line, truncated to whole seconds.  A summary without such a line
		# makes grep fail; that is expected and handled by the fallback below, hence the || true.
		finish_time=$(grep -E '^finish_time\>' "$summaries_filename" | awk ' { print $2 }' | sed 's/\..*$//') || true
		case "$finish_time" in
			None|"")
				# No finish_time was defined, so use the modification time of the summaries file instead.
				# The file name is passed as an argument rather than pasted into the Python source, so
				# quotes, spaces and the like in the name cannot break (or inject into) the Python code.
				finish_time=$(python -c 'import os, sys; print(int(os.stat(sys.argv[1]).st_mtime))' "$summaries_filename") || true
				;;
		esac

		# The comparison below needs an integer; skip anything else with a clear message instead of letting
		# [ fail with "integer expression expected".
		case "$finish_time" in
			""|*[!0-9]*)
				echo "$0: cannot determine finish time of $summaries_filename; not expiring it" 1>&2
				continue
				;;
		esac

		# This needs to be an integer comparison, hence the truncation(s) above.
		if [ "$finish_time" -lt "$threshold_seconds" ]
		then
			if ! rm $verbose -- "$summaries_filename"
			then
				echo "$0: rm $summaries_filename failed" 1>&2
				exit 1
			fi
			if ! rm $verbose -rf -- "$files_directory"
			then
				echo "$0: rm $files_directory failed" 1>&2
				exit 1
			fi
		fi
	done
then
	echo "$0: Expiring summary files failed" 1>&2
	exit 1
fi

# Finally, the chunk files - this is where most of the storage lives, and it is the slow part of the run.
#
# One find pass selects regular files whose modification time is past the threshold, plus directories that are
# already empty, and hands them to rm in batches via xargs.  A directory that only becomes empty because its
# empty child is removed by this pass is not picked up until the next run.  Using -exec instead of xargs might
# catch it, but that would be much slower for large repositories (save directories).
days_minus_1=$(($days - 1))
find "$save_directory"/chunks -mindepth 1 \
	\( \( -type f -mtime +"$days_minus_1" \) -o \( -type d -empty \) \) -print0 |
	xargs -0 rm $verbose -rf