#!/usr/bin/env python3

import json
import pathlib
import re
import typing

import bs4
import requests

# Path where scraped birthdays are persisted between runs (resume support).
BIRTHDAYS_JSON = pathlib.Path('mco-birthdays.json')

# Paginated wiki user-list endpoint; `offset` comes from each page's "next" link.
USERS_URL_TEMPLATE = 'https://minecraftonline.com/w/index.php?title=Special:ListUsers&limit={limit}&offset={offset}'

# Flush results to disk every N retrievals so progress survives a crash.
SAVE_CHUNK_SIZE = 10

# Wiki template parameters look like "birthdaymonth = June |"; capture the value.
# Raw f-strings (rf'') so \s is a regex escape, not an invalid string escape.
BIRTHDAY_REGEXES = {
    part: re.compile(rf'birthday{part}\s*=\s*([^|]+)')
    for part in ('year', 'month', 'day')
}

# Extracts the username from a "User:<name>" wiki link href.
USER_REGEX = re.compile(r'User:([^&]+)')

# Extracts the `offset` query parameter from a pagination link href.
OFFSET_REGEX = re.compile(r'offset=([^&]+)')

# A (possibly partial) birthday: any subset of the keys 'year', 'month', 'day'.
T_Birthday = dict[str, int]

# Month names; index + 1 is the month number.
MONTHS = (
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
    'september', 'october', 'november', 'december'
)


def find_month(query: str) -> typing.Optional[int]:
    """Return the 1-based month number whose English name starts with
    *query* (case-insensitive), or None when nothing matches.

    Ambiguous prefixes (e.g. 'ma') resolve to the earliest month.
    An empty query returns None (previously it wrongly matched January,
    because ``'january'[:0] == ''``).
    """
    needle = query.lower()
    if not needle:
        return None
    for number, month in enumerate(MONTHS, start=1):
        if month.startswith(needle):
            return number
    return None


def extract_birthday(source: typing.Union[str, bytes]) -> T_Birthday:
    """Parse birthday template parameters out of a wiki page's source.

    Returns a dict containing whichever of 'year', 'month', 'day' could
    be parsed; parts that are absent or unparseable are simply omitted,
    so the result may be empty.
    """
    result: T_Birthday = {}
    if isinstance(source, bytes):  # requests gives bytes; the regexes need str
        source = source.decode()
    for part, regex in BIRTHDAY_REGEXES.items():
        if match := regex.search(source):
            value = match.group(1).strip()
            if value.isnumeric():
                result[part] = int(value)
            # Fall back to spelled-out month names, e.g. "June" -> 6.
            elif month := find_month(value):
                result[part] = month
    return result


def retrieve_birthday(username: str) -> typing.Optional[T_Birthday]:
    """Fetch a user's wiki page source and extract their birthday.

    Returns None when the page cannot be fetched (HTTP error, e.g. a
    user without a user page); otherwise a possibly-empty birthday dict.
    """
    page_url = f'https://minecraftonline.com/w/index.php?title=User:{username}&action=edit'
    print(f'retrieving {page_url}')
    response = requests.get(page_url)
    if not response.ok:  # equivalent to catching raise_for_status()'s HTTPError
        return None
    return extract_birthday(response.content)


def retrieve_users_with_pages(limit_per_page: int = 500) -> list[str]:
    """Collect every username from the wiki's paginated user list whose
    link is not marked 'new' (i.e. users that have an existing page).

    Follows the "next" pagination link until it disappears.
    """
    usernames: list[str] = []
    offset = ''
    while True:
        page_url = USERS_URL_TEMPLATE.format(limit=limit_per_page, offset=offset)
        print(f'retrieving {page_url}')
        response = requests.get(page_url)
        response.raise_for_status()
        soup = bs4.BeautifulSoup(response.content, 'lxml')
        for anchor in soup.select('a.mw-userlink:not(.new)'):
            usernames.append(USER_REGEX.search(anchor['href']).group(1))
        # No "next" link means we've reached the last page.
        next_link = soup.select_one('.mw-nextlink')
        if not next_link:
            return usernames
        offset = OFFSET_REGEX.search(next_link['href']).group(1)


def main():
    """Scrape birthdays for all wiki users with pages, resuming from and
    periodically checkpointing to BIRTHDAYS_JSON."""
    birthdays: dict = {}
    if BIRTHDAYS_JSON.is_file():
        with BIRTHDAYS_JSON.open('r') as fp:
            birthdays = json.load(fp)

    users_with_pages = retrieve_users_with_pages()  # abt 40 requests
    users_to_retrieve = sorted(set(users_with_pages) - set(birthdays.keys()))
    print(f'{len(users_to_retrieve)} birthdays to attempt to retrieve ({len(birthdays)} already saved)')

    for i, username in enumerate(users_to_retrieve):
        result = retrieve_birthday(username)
        # `result` is None on HTTP error and {} when no birthday was found;
        # the truthiness check handles both (the original crashed on
        # `len(None)` and re-fetched the page a second time to store it).
        if result:
            print(result)
            birthdays[username] = result
        # Checkpoint every SAVE_CHUNK_SIZE users so a crash loses little work.
        if i % SAVE_CHUNK_SIZE == 0:
            _save_birthdays(birthdays)

    _save_birthdays(birthdays)


def _save_birthdays(birthdays: dict) -> None:
    """Write the accumulated birthdays to BIRTHDAYS_JSON as indented JSON."""
    with BIRTHDAYS_JSON.open('w') as fp:
        json.dump(birthdays, fp, indent=2)


if __name__ == '__main__':
    main()
