#!/usr/bin/env python3

# https://anti-spiegel.ru/2026/das-wall-street-journal-nennt-drei-moegliche-szenarien-fuer-ein-kriegsende/

import aiohttp
import asyncio
import bs4
import pickle
import os.path
import urllib.parse

MAX_MESSAGE_TEXT_LENGTH = 3980	# stay below a messenger's per-message limit (e.g. Telegram caps messages at 4096 characters)

# use a browser-like User-Agent; some sites serve reduced content to default client strings
headers = {
	"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:147.0) Gecko/20100101 Firefox/147.0",
}

async def init_cookies():
	global cookies
	cookies = aiohttp.CookieJar()
	# cookies.save( filename )  cookies.load( filename )
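	# e.g., to persist cookies across runs (COOKIE_FILE is an assumed path;
	# aiohttp.CookieJar pickles itself to disk via save()/load()):
	#   if os.path.isfile( COOKIE_FILE ):
	#   	cookies.load( COOKIE_FILE )
	#   # ... and after fetching: cookies.save( COOKIE_FILE )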

async def download_url( url ):
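	# fetch url and return a ( status, headers, html, url ) tuple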

	print( "download_url:", url )
	async with aiohttp.ClientSession( cookie_jar=cookies, headers=headers ) as session:

		async with session.get( url ) as response:
			html = await response.text()
			# convert the headers to a plain dict so the tuple pickles cleanly
			return ( response.status, dict( response.headers ), html, url )

def make_cache_filename( url, cachedir ):
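	# e.g. "https://anti-spiegel.ru/2026/foo/" -> "cache/2026%2Ffoo%2F.pickle"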
	for prefix in (
		"https://",
		"http://",
		"www.",
		"anti-spiegel.ru/"
	):
		if url.startswith( prefix ):
			url = url[len(prefix):]
	return os.path.join( cachedir, urllib.parse.quote_plus( url ) + ".pickle" )

async def load( url, cachedir ):
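	# return the cached ( status, headers, html, url ) tuple for url,
	# downloading and caching it on a miss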
	filename = make_cache_filename( url, cachedir )
	if os.path.isfile( filename ):
		print( "load cache: %s" % filename )
		with open( filename, "rb" ) as ifd:
			webpage = pickle.load( ifd )
	else:
		print( "download: %s" % url )
		webpage = await download_url( url )
		with open( filename, "wb" ) as ofd:
			pickle.dump( webpage, ofd )
	return webpage

def get_tag_text( tag ):
	# join the text of all NavigableString descendants with single spaces,
	# skipping whitespace-only nodes
	parts = []
	for kid in tag.descendants:
		if type(kid) is bs4.element.NavigableString:
			part = str(kid).strip()
			if part:
				parts.append( part )
	return " ".join( parts )

def append_tag_texts( data, text, tag ):
	# convenience wrapper: extract the tag's text and append it to the
	# running text via append_text()
	return append_text( data, text, get_tag_text( tag ) )

def append_text( data, text, appendtext ):
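	# append appendtext to the running text; when the combined length would
	# reach MAX_MESSAGE_TEXT_LENGTH, flush the current chunk into data and
	# start a new chunk with appendtext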
	if len(text) + len(appendtext) >= MAX_MESSAGE_TEXT_LENGTH:
		data.append( text )
		text = appendtext
	else:
		text += appendtext
	return text

def filter_html( html, url ):
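	# split the article into a list of message-sized text chunks and
	# ( "img", src ) tuples, preserving page order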
	soup = bs4.BeautifulSoup( html, 'html.parser' )
	# <div class="article__content">
	article = soup.find( "div", class_="article__content" )
	data = []
	if article is None:
		# page layout changed or this is not an article page
		print( "filter_html: no article__content div in", url )
		return data
	text = "URL: %s\n\n" % url
	for tag in article.contents:
		if type(tag) is bs4.element.NavigableString:
			pass
		elif tag.name == "h1":
			# title
			tagtext = get_tag_text( tag )
			tagtext = "\n**%s**\n\n" % tagtext
			text = append_text( data, text, tagtext )
		elif tag.name == "span":
			# some random text
			tagtext = get_tag_text( tag ) + "\n"
			text = append_text( data, text, tagtext )
		elif tag.name == "div" and len(tag.attrs) == 0:
			# probably just subtitle
			tagtext = get_tag_text( tag ) + "\n\n"
			text = append_text( data, text, tagtext )
		elif tag.name == "div" and "hero" in tag.attrs["class"]:
			# from, date, time
			tagtext = get_tag_text( tag )
			# collapse all whitespace runs (newlines, tabs, repeated spaces) into single spaces
			tagtext = " ".join( tagtext.split() ) + "\n\n"
			text = append_text( data, text, tagtext )
		elif tag.name == "div" and "article__content" in tag.attrs["class"]:
			# content
			for subtag in tag.contents:
				if type(subtag) is bs4.element.NavigableString:
					# probably can ignore random text
					pass
				elif subtag.name == "p":
					tagtext = get_tag_text( subtag ) + "\n\n"
					text = append_text( data, text, tagtext )
				elif subtag.name == "h3":
					tagtext = "\n" + get_tag_text( subtag ) + "\n\n"
					text = append_text( data, text, tagtext )
				elif subtag.name == "div" and "shariff" in tag.attrs["class"]:
					pass
				else:
					print( "***", str(subtag)[:240] )
		elif tag.name == "a" and "page-link" in tag.attrs["class"]:
			# link to next article, ignore
			pass
		elif tag.name == "hr":
			# ignore
			pass
		elif tag.name == "img":
			# picture
			if text != "":
				data.append( text )
				text = ""
			data.append( ( "img", tag.attrs["src"] ) )
		else:
			# tagtext = get_tag_text( tag )
			print( "---------" )
			print( tag.name )
			print( tag.attrs )
	if text != "":
		data.append( text )
	print( "=========" )
	return data

class URLs:
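	# urls_done: URLs already processed, persisted one per line in urlfile;
	# urls_new: URLs discovered this run but not yet processed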

	def __init__( self, baseurl, urlfile ):
		self.baseurl = baseurl
		self.urlfile = urlfile
		self.urls_done = {}
		self.urls_new = {}
		self.ofp = None
		self.read_file()

	def get_base_url( self ):
		return self.baseurl

	def read_file( self ):
		if os.path.isfile( self.urlfile ):
			self.urls_done = {}
			self.urls_new = {}
			with open( self.urlfile, "r" ) as ifp:
				for line in ifp:
					self.urls_done[ line.strip() ] = True
		self.ofp = open( self.urlfile, "a+" )

	def add_new( self, url ):
		if url not in self.urls_done and url not in self.urls_new:
			self.urls_new[url] = True
			print( "added:", url )

	def has_new_url( self ):
		return len(self.urls_new) > 0

	def next_new_url( self ):
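		# dicts preserve insertion order (Python 3.7+), so this returns the
		# oldest pending URL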
		return next( iter( self.urls_new.keys() ) )

	def url_done( self, url ):
		self.urls_done[url] = True
		self.ofp.write( url + "\n" )
		del self.urls_new[url]
		print( "url_done:", url )

	def flush( self ):
		self.ofp.flush()

def filter_urls( html, urls ):
	soup = bs4.BeautifulSoup( html, 'html.parser' )
	for a in soup.find_all( 'a' ):
		if 'href' in a.attrs:
			url = a.attrs['href']
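			# article URLs embed the publication year; the "/20" prefix matches 20xx years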
			if url.startswith( 'https://anti-spiegel.ru/20' ):
				urls.add_new( url )

def html2pickle( filename, url ):
	# wrap a locally saved HTML file ("test.html") in the cache pickle
	# format, mimicking the tuple returned by download_url()
	with open( "test.html", "r" ) as ifd:
		html = ifd.read()
	webpage = ( 200, [], html, url )
	with open( filename, "wb" ) as ofd:
		pickle.dump( webpage, ofd )

async def check_for_new_urls( urls ):
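	# fetch the front page (baseurl) and queue any article links found there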
	webpage = await download_url( urls.get_base_url() )
	print( "check_for_new_urls:", webpage[0] )
	if webpage[0] == 200:
		filter_urls( webpage[2], urls )

async def test_process_page( cachedir ):
	url = "https://anti-spiegel.ru/2026/das-wall-street-journal-nennt-drei-moegliche-szenarien-fuer-ein-kriegsende/"
	filename = make_cache_filename( url, cachedir )
	print( filename )
	# html2pickle( filename, url )
	webpage = await load( url, cachedir )
	if webpage[0] == 200:
		data = filter_html( webpage[2], webpage[3] )
		#print( "data:", data )
		for line in data:
			print( "------ %d ------" % len(line) )
			print( line )
	else:
		print( "response code:", webpage[0] )

async def main():
	baseurl = "https://anti-spiegel.ru/"
	cachedir = "cache"
	urlfile = "urls.txt"
	os.makedirs( cachedir, exist_ok=True )	# the cache directory must exist before load() writes to it
	urls = URLs( baseurl, urlfile )
	await init_cookies()	# download_url() needs the global cookie jar
	await test_process_page( cachedir )
	# await check_for_new_urls( urls )
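	# a sketch of the full crawl loop (where the resulting chunks should be
	# sent is not defined in this script):
	# while urls.has_new_url():
	# 	url = urls.next_new_url()
	# 	webpage = await load( url, cachedir )
	# 	if webpage[0] == 200:
	# 		data = filter_html( webpage[2], webpage[3] )
	# 		# ... send/store the chunks in data ...
	# 	urls.url_done( url )
	# 	urls.flush()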

if __name__ == "__main__":
	asyncio.run(main())

