#!/usr/bin/env python3
# SPDX-License-Identifier: BlueOak-1.0.0

import io
import sys
import anyio
import aiohttp
import platform
import contextlib
from yarl import URL
import importlib.util
from bs4 import BeautifulSoup
from dataclasses import dataclass
from typing import Iterable, Union

# True iff the optional lxml parser is importable; used below to pick
# BeautifulSoup's backend (lxml is faster than the stdlib html.parser).
HAVE_LXML = bool(importlib.util.find_spec('lxml'))

# Identify ourselves to the archive server: script name plus the HTTP client
# library version and the Python implementation/version.
USER_AGENT = '; '.join((
	sys.argv[0],
	'aiohttp/' + aiohttp.__version__,
	f'{platform.python_implementation()}/{platform.python_version()}',
))

@dataclass
class MailingList:
	"""A mailing list scraped from an archive index page.

	Note: the original field documentation was written as bare strings
	*before* each field, which made the first one the class docstring and
	the second a no-op statement; they are plain comments now.
	"""
	# usually the same as the local part of the list's email address
	name: str
	# links to all the full text archives of the list
	text_urls: Iterable[Union[URL, str]]

async def amain():
	"""Fetch the archive index named on the command line and download every
	full-text archive it links to, into ./<host>/<list name>/.

	Downloads run concurrently in an anyio task group; fetch() prints one
	progress dot per completed file.
	"""
	if len(sys.argv) < 2:
		# fail with a usage message instead of an opaque IndexError
		sys.exit(f'usage: {sys.argv[0]} <archive index page URL>')
	archive_page_url = sys.argv[1]
	async with \
		aiohttp.ClientSession(headers={'User-Agent': USER_AGENT}) as http, \
		anyio.create_task_group() as tg \
	:
		async with http.get(archive_page_url) as resp:
			soup = BeautifulSoup(await resp.text(), 'lxml' if HAVE_LXML else 'html.parser')

		# use resp.url, not archive_page_url, so redirects are honored
		mailing_list = parse_mailing_list(resp.url, soup)  # renamed: don't shadow builtins.list
		output_dir = anyio.Path(resp.url.host) / mailing_list.name
		# parents=True: on a first run the <host> directory doesn't exist yet,
		# and mkdir() without it raises FileNotFoundError
		await output_dir.mkdir(parents=True, exist_ok=True)
		for url in mailing_list.text_urls:
			tg.start_soon(fetch, http, output_dir, url)

	# terminate the progress-dot line emitted by fetch()
	print(file=sys.stderr)

def parse_mailing_list(url, soup):
	"""Build a MailingList from a parsed archive index page.

	*url* is the (possibly redirected) page URL, used to resolve relative
	archive links; *soup* is the parsed index page.
	"""
	# the page <title> has the form: "The <name> Archives"
	name = soup.find('title').text.removeprefix('The ').removesuffix(' Archives')
	# use .join instead of / in case the href is absolute 🙄
	# weird that pathlib supports `absolute / absolute` but yarl doesn't
	archive_links = (
		url.join(URL(anchor.attrs['href']))
		for anchor in soup.select('td:last-child a')
	)
	return MailingList(name, text_urls=archive_links)

async def fetch(http, output_dir, link):
	"""Download *link* into *output_dir*, skipping files that are already complete.

	The output file is opened in append mode so its current size can be
	compared against the server's Content-Length before re-downloading.
	Prints a progress dot to stderr on each completed (re-)download.
	"""
	async with \
		await (out_path := output_dir / link.name).open('ab') as outf, \
		http.get(link) as resp \
	:
		# fail loudly on 4xx/5xx instead of saving an error page as an archive
		resp.raise_for_status()

		# Content-Length may be absent (e.g. chunked encoding) — then KeyError
		# is suppressed and we fall through to a full re-download
		with contextlib.suppress(KeyError):
			if int(resp.headers['Content-Length']) == (await out_path.stat()).st_size:
				# we already have the whole file
				return

		# start over from an empty file; resuming via Range requests is not
		# supported yet
		await outf.seek(0)
		await outf.truncate()
		await acopyfile(resp.content, outf)

	print('.', end='', file=sys.stderr, flush=True)

async def acopyfile(inf, outf, buf_size=io.DEFAULT_BUFFER_SIZE):
	"""Stream all remaining bytes from *inf* to *outf* in buf_size-sized chunks."""
	while True:
		chunk = await inf.read(buf_size)
		if not chunk:
			break
		await outf.write(chunk)

def main():
	"""Synchronous entry point: run the async downloader under anyio."""
	anyio.run(amain)

if __name__ == '__main__':
	main()
