#!/usr/bin/env python

from argparse import ArgumentParser
from datetime import datetime
from glob import glob
from multiprocessing import Pool
from multiprocessing import cpu_count
from os import unlink
from os.path import basename
from os.path import isfile
from os.path import join
from os.path import getsize
from re import finditer
from re import search
from sys import stderr

from libarchive import file_reader

# Architectures whose `<repo>/<subdir>/os/<arch>` directories main() scans.
ARCHS = ['x86_64']
# Archive suffixes accepted for package files; they are appended after
# `.pkg` (binary packages) or `.src` (source tarballs) when building names.
PKG_EXTENSIONS = ['.tar.xz', '.tar.gz', '.tar.bz2', '.tar.zst', '.tar.lrz', '.tar.lzo', '.tar.Z', '.tar.lz4', '.tar.lz']
# Suffix of signature files: skipped while scanning, removed with their package.
SIG_EXT = '.sig'
# Default value of the --archive option.
DEFAULT_ARCHIVE_DIR = '/srv/archive'
# Default value of the --repo option.
DEFAULT_SOURCE_DIR = '/srv/ftp'
# Sub-directories of the repo root that main() scans for live packages.
SOURCE_WHITELIST = ['core', 'extra', 'staging', 'testing', 'community', 'community-staging', 'community-testing', 'multilib', 'multilib-staging', 'multilib-testing', 'gnome-unstable', 'kde-unstable']
# Default value of --keep-years; main() only purges year directories whose
# number is <= (current year - keep_years).
DEFAULT_KEEP_YEARS = 3


def get_sourceball(path):
    """Return the source tarball path that belongs to an archived package.

    *path* must contain a ``YYYY/MM/DD`` component and end in a file name of
    the form ``<name>-<pkgver>-<pkgrel>-<arch>.pkg<ext>`` where ``<ext>`` is
    one of PKG_EXTENSIONS.  The result is
    ``<prefix>/sources/packages/<name>-<pkgver>-<pkgrel>.src<ext>`` where
    ``<prefix>`` is *path* up to and including its first date component, so
    the tarball is looked up in the same tree the package came from rather
    than under a hard-coded archive root.

    Raises:
        ValueError: if *path* has no date component, does not end with a
            known package extension, or its file name has fewer than three
            ``-`` separated trailing fields.
    """
    date_match = search(r'\d{4}/\d{2}/\d{2}', path)
    if date_match is None:
        raise ValueError(f'Expecting path to contain a YYYY/MM/DD component. {path} given')

    filename = basename(path)
    pkg_ext = next((ext for ext in PKG_EXTENSIONS if filename.endswith(ext)), None)
    if pkg_ext is None:
        raise ValueError(f'Expecting path to end with one of these extensions {" ".join(PKG_EXTENSIONS)}. {path} given')

    # Split from the right: the package name itself may contain dashes, the
    # three trailing fields (pkgver, pkgrel, arch + extension) may not.
    try:
        pkgname, pkgver, pkgrel, _ = filename.rsplit('-', 3)
    except ValueError:
        raise ValueError(f'Expecting file name of the form name-pkgver-pkgrel-arch. {path} given') from None

    # Everything up to the end of the date, e.g. "<archive>/repos/2016/01/01".
    snapshot_root = path[:date_match.end()]
    return f'{snapshot_root}/sources/packages/{pkgname}-{pkgver}-{pkgrel}.src{pkg_ext}'


def human_filesize(size):
    """Format a byte count as a string with two decimals and a binary unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    supported unit (TB) is reached; anything bigger is still expressed in
    TB instead of being divided once more without a matching unit.

    Returns a string such as ``'1.50 KB'`` or ``'512.00 B'``.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    index = 0
    # Stop on the last unit so the number and its label always agree.
    while size >= 1024 and index < len(units) - 1:
        size /= 1024
        index += 1
    return f'{size:.2f} {units[index]}'


def read_file(archive, filename, arch):
    """Collect the package file names that the package *filename* keeps alive.

    Opens *filename* with libarchive's ``file_reader`` and looks for an entry
    named ``.BUILDINFO``.  When found, the packages it lists are resolved
    against *archive* via get_packages_from_buildinfo(); scanning stops at
    that entry.  The base name of *filename* itself is always appended.

    A warning is written to stderr only when the archive really has no
    ``.BUILDINFO`` entry, not when the entry exists but nothing in it could
    be resolved (that case is reported by get_packages_from_buildinfo()).

    Returns a list of package file base names.
    """
    packages = []
    buildinfo_found = False
    with file_reader(filename) as pkg:
        for entry in pkg:
            if entry.name == '.BUILDINFO':
                buildinfo_found = True
                packages.extend(get_packages_from_buildinfo(archive, filename, entry, arch))
                break
    if not buildinfo_found:
        print(f'WARNING: .BUILDINFO not found in package {filename}', file=stderr)
    packages.append(basename(filename))
    return packages

def get_packages_from_buildinfo(archive, filename, archive_entry, arch):
    """Resolve the ``installed = ...`` lines of a .BUILDINFO to archive files.

    *archive_entry* must provide ``get_blocks()`` yielding the raw bytes of
    the .BUILDINFO entry.  Each ``installed`` value is looked up below
    ``<archive>/packages/<first letter>/<pkgname>/``:

    * format > 0: the value is used as the file stem as-is and the package
      name is the value minus its last three ``-`` separated fields;
    * format 0 (or no ``format`` line): the package name is the value minus
      its last two fields, and both ``<value>-any`` and ``<value>-<arch>``
      are tried, in that order.

    Entries that cannot be parsed or found are reported on stderr and
    skipped.  *filename* is only used in log messages.

    Returns the base names of the files found, one per resolved entry.
    """
    buildinfo = b''.join(archive_entry.get_blocks()).decode()
    format_match = search(r'format = (\d)', buildinfo)
    buildinfo_format = int(format_match.group(1)) if format_match else 0
    installed = [match.group(1) for match in finditer(r'installed = (.+)', buildinfo)]
    print(f'INFO: found .BUILDINFO format {buildinfo_format} with {len(installed)} installed packages for {filename}', file=stderr)

    # Number of trailing dash-separated fields that are not part of the name.
    trailing_fields = 3 if buildinfo_format > 0 else 2

    packages = []
    for package in installed:
        pkgname = '-'.join(package.split('-')[:-trailing_fields])
        if not pkgname:
            # Guard both formats: an empty name would break pkgname[0] below.
            print(f'ERROR: failed to parse {package} with format {buildinfo_format} needed for {filename}', file=stderr)
            continue

        if buildinfo_format > 0:
            stems = [package]
        else:
            # The value carries no architecture here; "any" is preferred.
            stems = [f'{package}-any', f'{package}-{arch}']

        directory = join(archive, 'packages', pkgname[0], pkgname)
        paths = [join(directory, f'{stem}.pkg{ext}') for stem in stems for ext in PKG_EXTENSIONS]
        existing_paths = [path for path in paths if isfile(path)]
        if not existing_paths:
            print(f'ERROR: {package} needed for {filename} not in archive {" ".join(paths)}', file=stderr)
            continue
        # as it shouldn't be possible to have more than one extension take the first and append it
        packages.append(basename(existing_paths[0]))

    return packages


def main(repo=DEFAULT_SOURCE_DIR, archive=DEFAULT_ARCHIVE_DIR, keep_years=DEFAULT_KEEP_YEARS, processes=cpu_count() * 2, delete=False):
    """Purge old archived packages that no package in *repo* still needs.

    1. Every package below ``<repo>/<subdir>/os/<arch>`` (for the whitelisted
       sub-directories and architectures) is read in parallel with
       read_file() to build the set of needed package file names.
    2. Package files below ``<archive>/repos/<year>/*/*`` are collected for
       every year directory whose number is <= current year - *keep_years*.
    3. Files whose base name is not needed are purged together with their
       signature and sourceball.  With *delete* false nothing is removed;
       the sizes are summed instead (each base name counted once).

    Args:
        repo: root directory of the live repositories.
        archive: root directory of the archive.
        keep_years: number of years to keep.
        processes: worker count for the process pools.
        delete: really unlink files instead of only estimating the size.
    """
    tasks = []
    for subdir in SOURCE_WHITELIST:
        for arch in ARCHS:
            directory = join(repo, subdir, 'os', arch)
            for filename in glob(join(directory, '*.pkg.*')):
                if not filename.endswith(SIG_EXT):
                    tasks.append((archive, filename, arch))

    with Pool(processes=processes) as pool:
        results = pool.starmap(read_file, tasks)

    needed = set()
    for result in results:
        needed.update(result)

    for need in needed:
        print(f'INFO: needed package: {need}')

    keep_until = datetime.now().year - keep_years
    # fix before year 3000
    # Ignore entries matching "2*" that are not plain numbers instead of
    # crashing on int().
    year_names = [basename(path) for path in glob(join(archive, 'repos', '2*'))]
    years = [int(name) for name in year_names if name.isdecimal() and int(name) <= keep_until]

    snapshots = [path for year in years for path in glob(join(archive, 'repos', f'{year}', '*', '*'))]
    purge_patterns = [join(path, '*', 'os', '*', '*.pkg.tar.*') for path in snapshots] + \
                     [join(path, 'pool', '*', '*.pkg.tar.*') for path in snapshots]

    with Pool(processes=processes) as pool:
        results = pool.map(glob, purge_patterns)

    all_files = [path for paths in results for path in paths if not path.endswith(SIG_EXT)]
    keep = [path for path in all_files if basename(path) in needed]
    purge = [path for path in all_files if basename(path) not in needed]

    for path in keep:
        print(f'INFO: keeping file {path}')

    # Only accumulated in dry-run mode; stays 0 when files are really deleted.
    cleanup_size = 0
    removed = set()

    for path in purge:
        print(f'INFO: deleting file {path}')

        sourceball = get_sourceball(path)
        print(sourceball)

        companions = (f'{path}{SIG_EXT}', sourceball)

        if delete:
            for target in (path,) + companions:
                try:
                    unlink(target)
                except FileNotFoundError:
                    print(f'ERROR: file not found for deletion: {target}')
            continue

        pkg = basename(path)
        if pkg in removed:
            continue

        try:
            cleanup_size += getsize(path)
            # Remember the name only once the package itself was measured.
            removed.add(pkg)
        except FileNotFoundError:
            print(f'ERROR: file not found for calculation size: {path}')

        for target in companions:
            try:
                cleanup_size += getsize(target)
            except FileNotFoundError:
                print(f'ERROR: file not found for calculation size: {target}')

    cleanup_size = human_filesize(cleanup_size)
    print(f'total estimated cleanup size {cleanup_size}')


if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them to main().
    default_processes = cpu_count() * 2

    cli = ArgumentParser(description='Archive cleaner')
    cli.add_argument(
        '--archive',
        default=DEFAULT_ARCHIVE_DIR,
        help=f'root directory of the archive (default: {DEFAULT_ARCHIVE_DIR})',
    )
    cli.add_argument(
        '--repo',
        default=DEFAULT_SOURCE_DIR,
        help=f'root directory of the repo (default: {DEFAULT_SOURCE_DIR})',
    )
    cli.add_argument(
        '--keep-years',
        type=int,
        default=DEFAULT_KEEP_YEARS,
        help=f'amount of years to keep (default: {DEFAULT_KEEP_YEARS})',
    )
    cli.add_argument(
        '--processes',
        type=int,
        default=default_processes,
        help=f'number of parallel processes (default: {default_processes})',
    )
    # Without this flag the script only reports what it would remove.
    cli.add_argument(
        '--delete',
        action='store_true',
        default=False,
        help='really delete unwanted files',
    )
    options = cli.parse_args()

    main(
        repo=options.repo,
        archive=options.archive,
        keep_years=options.keep_years,
        processes=options.processes,
        delete=options.delete,
    )
