# User:GalliumBot/vandyke/vandyke.py

"""
Copyright (c) 2022 theleekycauldron

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
"""
import pywikibot as pwb
from pywikibot import pagegenerators
import re
import requests
import datetime
import random

# Views-per-hour cut-offs: Hook.__init__ picks index 1 for a hook that has an
# image and index 0 for one that does not.
threshold = [600,1000]
# Month names in calendar order; generate_wikitext uses the list to find the
# month that follows a given archive page's month.
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
# Shared pywikibot Site (English Wikipedia) used for every Page lookup and search in this file.
site = pwb.Site("en","wikipedia")
# Signature appended to the edit summaries this script writes.
tag = "[[[User:GalliumBot#vandyke|vandyke]] v2.3.2]"

class Article:
    """One article of a DYK hook together with its accumulated pageview figures.

    Attributes:
        title: page title as passed to the constructor.
        obj: pywikibot Page for ``title`` on the module-level ``site``.
        alts: other titles found in "moved page" edit summaries (see get_alts).
        views: views on the day of interest minus the background estimate,
            summed over every title passed to get_views.
        vph: ``views`` expressed per hour of the supplied time window.
        background: summed background (ordinary traffic) estimates.
        background_vph: ``background`` expressed per hour of the time window.
        error: set to True as soon as one pageview lookup could not be used.
    """

    # Literal substitutions applied to a title before it goes into the pageviews URL path.
    _URL_REPLACEMENTS = {
        " ":      "_",
        "&nbsp;": "_",
        "/":      "%2F",
        "?":      "%3F",
    }
    # One alternation over the (escaped) keys above, compiled once for all instances.
    _URL_REPLACEMENT_RE = re.compile("(%s)" % "|".join(map(re.escape, _URL_REPLACEMENTS)))

    def __init__(self,title,alts=None,views=0,vph=0,background=0,background_vph=0,error=False):
        self.title          = title
        self.obj            = pwb.Page(site,self.title)
        self.alts           = [] if alts is None else alts
        self.views          = views
        self.vph            = vph
        self.background     = background
        self.background_vph = background_vph
        self.error          = error

    def get_alts(self,timeslots):
        """Collect titles named in "moved page [[...]]" edit summaries.

        Args:
            timeslots: two datetimes ``[start, end]``; only revisions between
                them are inspected.

        Each new title (not already in ``self.alts`` and different from
        ``self.title``) is appended to ``self.alts``. Summaries whose link is
        missing or never closed with ``]]`` are skipped instead of raising
        IndexError.
        """
        window = [pwb.Timestamp.fromisoformat(slot.strftime("%Y-%m-%dT%H:%M:%S")) for slot in timeslots]
        for revision in self.obj.revisions(starttime=window[1],endtime=window[0]):
            words = (revision.comment or "").split(" ")  # tolerate a missing summary
            # Expected shape: "<user> moved page [[Old title]] to [[New title]] ..."
            if words[1:3] != ["moved","page"] or len(words) < 4 or words[3][:2] != "[[":
                continue
            # A title may contain spaces, so the link can span several words.
            closing = next((i for i in range(3,len(words)) if words[i][-2:] == "]]"), None)
            if closing is None:
                continue  # link never closes; nothing usable to extract
            alt = " ".join(words[3:closing+1])[2:-2]
            if alt not in self.alts and alt != self.title:
                self.alts.append(alt)

    def sanitize(self,title=None):
        """Return ``title`` (default: ``self.title``) with URL-unsafe pieces replaced.

        Spaces and ``&nbsp;`` become ``_``; ``/`` and ``?`` are percent-encoded.
        """
        if title is None:
            title = self.title
        return Article._URL_REPLACEMENT_RE.sub(lambda mo: Article._URL_REPLACEMENTS[mo.group(0)], title)

    def _tally(self,viewsarr,datesarr,raw_date,time):
        """Fold one title's daily view counts into the running totals.

        Args:
            viewsarr: daily view counts, oldest first.
            datesarr: matching timestamps formatted as ``%Y%m%d00``.
            raw_date: datetime of the day of interest.
            time: timedelta used to convert counts to per-hour figures.

        Raises:
            ValueError: the day of interest is not in ``datesarr``.
            IndexError: there are not enough neighbouring days to estimate
                the background.

        Nothing on ``self`` is modified when an exception is raised.
        """
        ind = datesarr.index(raw_date.strftime("%Y%m%d00"))
        padded = list(viewsarr)
        if ind < 2:
            # Too few earlier days: duplicate the nearest neighbour in front so
            # that two "previous" days exist.
            padded = [padded[1-ind]]*(2-ind) + padded
            ind = 2
        if ind == len(padded)-1:
            # No later day: reuse the previous day. Checked independently of the
            # front padding because a very short response can need both.
            padded.append(padded[ind-1])

        background = (padded[ind-1]+min(padded[ind-2],padded[ind+1]))/2
        net_views = padded[ind] - background
        seconds = time.total_seconds()
        # Add only THIS title's contribution; folding the running totals back in
        # would double-count earlier titles once alts are processed.
        self.background     += background
        self.views          += net_views
        self.vph            += 3600*net_views/seconds
        self.background_vph += 3600*background/seconds

    def get_views(self,title,dates,raw_date,time,jitter):
        """Fetch daily pageviews for ``title`` and add them to this article's totals.

        Args:
            title: title to query (the article itself or one of its alts).
            dates: ``[first, last]`` day strings placed in the request URL.
            raw_date: datetime of the day of interest.
            time: timedelta used to convert counts to per-hour figures.
            jitter: when true, a random ``max-age`` query string is appended
                (presumably to avoid a cached response -- confirm).

        On an unusable response ``self.error`` is set, the details are printed
        and the totals are left untouched.
        """
        jitterbug = f"?max-age={random.randint(1,1000)}" if jitter else ""
        url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/user/{self.sanitize(title=title)}/daily/{dates[0]}/{dates[1]}{jitterbug}"
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
        response = requests.get(url=url,headers=headers).json()
        try:
            viewsarr = [r["views"] for r in response["items"]]
            datesarr = [r["timestamp"] for r in response["items"]]
        except KeyError as e:
            self.error = True
            print(url,response,e)
            return

        try:
            self._tally(viewsarr,datesarr,raw_date,time)
        except (ValueError, IndexError) as e:
            self.error = True
            print(url,response,e)
            return

        print(f"{self.title}: {self.vph}")

class Hook:
    """A single DYK hook: its text, run window, image and per-article pageviews.

    Constructing a Hook extracts the bolded articles from the hook text and
    immediately fetches their pageviews.
    """

    def __init__(self,text,timeslots,image,jitter):
        self.text      = text                      # "... that '''[[leek]]s''' are objectively the best vegetable, as opposed to '''[[carrot]]s'''?"
        self.timeslots = timeslots                 # [datetime.datetime(2020,7,29,hour=0,minute=0),datetime.datetime(2020,7,29,hour=12,minute=0)]
        self.image     = image.replace("File:","") # "Leek.jpg" or ""
        self.threshold = threshold[1] if self.image else threshold[0] #creates self threshold for background
        self.extract_articles()                    # ["Leek", "Carrot"]
        self.get_views(jitter)                     # {"Leek": 10253, "Carrot": 231}

    def dates_of_interest(self):
        """Work out the day to measure and the length of the measured window.

        Sets ``self.date`` (midnight of the day containing the midpoint of the
        run), ``self.dft`` (the midnight nearest that midpoint) and
        ``self.time`` (length of the measured window). When the run crosses
        midnight, only the larger part on one side of ``self.dft`` is kept and
        ``self.timeslots`` is trimmed to it in place.

        Returns:
            ``[self.date - 5 days, self.date + 3 days]``.
        """
        start, end = self.timeslots[0], self.timeslots[1]
        midpoint = start + (end - start)/2
        nearest = midpoint + datetime.timedelta(days=1) if midpoint.hour>=12 else midpoint
        # Zero seconds and microseconds too: the midpoint of a window with an odd
        # number of minutes carries 30 seconds, which would otherwise leak into
        # the supposed midnight values.
        self.date = midpoint.replace(hour=0,minute=0,second=0,microsecond=0)
        self.dft  = nearest.replace(hour=0,minute=0,second=0,microsecond=0)

        if start.date() == end.date(): #start/end on the same calendar day (12-hour pt. 1)
            self.time = end - start
        elif end - self.dft > self.dft - start: #keep the largest segment
            self.time = end - self.dft
            self.timeslots[0] = self.dft
        else:
            self.time = self.dft - start
            self.timeslots[1] = self.dft

        return [self.date - datetime.timedelta(days=5),self.date + datetime.timedelta(days=3)]

    def get_views(self,jitter):
        """Fetch views for every article (and its alts) and compute hook totals.

        Sets ``total_views``, ``total_vph``, ``total_background_vph`` and
        ``stats`` (whether ``total_vph`` reaches ``self.threshold``), then
        orders ``self.articles`` by views per hour, highest first.
        """
        dates = [datetime.datetime.strftime(date,"%Y%m%d00") for date in self.dates_of_interest()]
        for article in self.articles:
            article.get_views(article.title,dates,self.date,self.time,jitter)
            try:
                article.get_alts(self.timeslots)
            except pwb.exceptions.NoPageError as e:
                print(e)  # page does not exist, so there is no history to scan
            for alt in article.alts:
                article.get_views(alt,dates,self.date,self.time,jitter)

        self.total_views = sum(article.views for article in self.articles)
        self.total_vph = sum(article.vph for article in self.articles)
        self.total_background_vph = sum(article.background_vph for article in self.articles)
        self.stats = self.total_vph >= self.threshold
        if len(self.articles)>1:
            self.articles.sort(key=lambda x:x.vph,reverse=True)

    def notify(self):
        """Add a {{DYK views}} note under the DYK notice on matching user talk pages.

        Searches the "User talk" namespace (at most 5 hits), skips subpages,
        pages where the exact heading line is absent, and pages where the note
        already appears near the heading.
        """
        title = self.articles[0].title
        pages = list(pagegenerators.SearchPageGenerator(f'insource:"==DYK for {title}=={{{{ivmbox |image = Updated DYK query.svg"',total=5,namespaces=["User talk"],site=site))
        for page in pages:
            if "/" in page.title():
                continue
            pagetext = page.text.splitlines()
            try:
                ind = pagetext.index(f"==DYK for {title}==")
            except ValueError:
                # The search matched, but the heading is not on a line of its own
                # in exactly this form; skip the page rather than abort the run.
                continue
            if any("{{DYK views" in line for line in pagetext[ind:ind+11]):
                continue
            # NOTE(review): offsets 6 and 11 presumably match the layout of the
            # standard DYK notice -- confirm against the notice template.
            pagetext.insert(ind+6,f'{{{{DYK views|{round(self.total_views):,}|{round(self.total_vph,1):,}|{datetime.datetime.strftime(datetime.datetime.now(),"%B %Y")}|{title}}}}} ~~~~')
            page.text = "\n".join(pagetext)
            page.save(summary=f"/* DYK for {title} */ your hook reached {round(self.total_views):,} views! {tag}",botflag=True)

    def use_background(self,i): #unpythonic, but easy to fiddle with
        """Return True when article ``i``'s background figure should be shown in its row."""
        if self.articles[i].background >= 1000:
            return True

        if self.articles[i].views < 0:
            return True

        # The hook misses the threshold only because the background was subtracted.
        if self.total_vph<self.threshold and self.total_vph+self.total_background_vph>=self.threshold and i==0:
            return True

        return False

    def __repr__(self):
        """Return the hook as wikitext rows for the DYK stats table, one line per article."""
        date = datetime.datetime.strftime(self.date,"%Y-%m-%d")
        last = len(self.articles)-1
        rows = []
        for i, article in enumerate(self.articles):
            links = [f"[[{alt}]]" for alt in article.alts]
            if not links:
                alts = ""
            elif len(links) == 1:
                alts = f"|alts={links[0]}"
            elif len(links) == 2:
                alts = f"|alts={links[0]} and {links[1]}"
            else:
                alts = "|alts=" + ", ".join(links[:-1]) + ", and " + links[-1]

            total = ""
            if i>0:
                head = "{{DYK stats table multi"
                image = ""
                if i == last:
                    total = f"\n{{{{DYK stats table multi total|{round(self.total_views):,}|{round(self.total_vph,1):,}}}}}"
            else:
                head = "{{DYK stats table multi begin" if len(self.articles)>1 else "{{DYK stats table row"
                image = '|' + self.image

            if article.error:
                background = "|error=y"
            elif self.use_background(i):
                background = f"|b={article.background:,}"
            else:
                background = ""
            # Only the opening row of a multi-article hook carries the article count,
            # and continuation rows do not repeat the hook text.
            articlecount = f"|{len(self.articles):,}" if head == '{{DYK stats table multi begin' else ''
            hooktext = self.text if head != '{{DYK stats table multi' else ''
            rows.append(f"{head}|{article.title}{articlecount}{image}|{date}|{round(article.views):,}|{round(article.vph,1):,}|{hooktext}{background}{alts}}}}}{total}\n")
        return "".join(rows)

    def extract_articles(self):
        """Fill ``self.articles`` with an Article for each bolded link in the hook text."""
        bolded = re.findall(r"'''(.+?)'''",self.text)
        bolded = [(expand_templates(a) if "{{" in a else a) for a in bolded]
        titles  = [a[0].capitalize() + a[1:] for a in re.findall(r"\[\[(?!Category:)([^\|\]#]+)"," ".join(bolded))] # standard extraction
        titles += [a[0].capitalize() + a[1:] for a in re.findall(r"\[\[([^\|\]#]+)(?:\||\]\]|#)'''",self.text)] # missing entries: bold markup placed inside the link brackets

        # Remove duplicates while keeping first-appearance order, so the result
        # does not depend on set iteration order.
        titles = list(dict.fromkeys(titles))

        self.articles = [Article(title) for title in titles]



def expand_templates(text):
    """Expand the templates in ``text`` through the English Wikipedia API.

    Args:
        text: wikitext that may contain template calls.

    Returns:
        The expanded wikitext, with every ``&#32;`` entity replaced by a space.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "expandtemplates",
        "text": text,
        "prop": "wikitext",
        "format": "json"
    }

    # The context manager closes the session's connections once the call is
    # done; the response body has been read in full by then.
    with requests.Session() as session:
        response = session.get(url=url, params=params)
    data = response.json()
    return data["expandtemplates"]["wikitext"].replace("&#32;"," ")

def generate_wikitext(archivepagename):
    """Return the wikitext to analyse for one "Recent additions" page.

    For the live page ("Wikipedia:Recent additions") the page text is returned
    unchanged. For a "Wikipedia:Recent additions/<year>/<month>" archive, the
    tail of the following month's page, starting at its last "*''''"
    occurrence, is put in front of the archive text, separated by a newline.
    When the following month's page is a redirect, the live page is used.

    Args:
        archivepagename: full title of the page to read.

    Returns:
        The combined wikitext as a single string.
    """
    current_text = pwb.Page(site,archivepagename).text
    if archivepagename == "Wikipedia:Recent additions":
        return current_text

    year_month = archivepagename.split("/")[1:]
    month = year_month[1]
    year = year_month[0]
    if month == "December":
        # The month after December belongs to the next year.
        following_title = f"Wikipedia:Recent additions/{int(year)+1}/January"
    else:
        following_title = f"Wikipedia:Recent additions/{year}/{months[months.index(month)+1]}"

    following = pwb.Page(site,following_title)
    if following.text[:9].lower() == "#redirect":
        following = pwb.Page(site,"Wikipedia:Recent additions")

    following_text = following.text
    # Presumably this supplies the timestamp that closes this month's newest set -- confirm.
    tail = following_text[following_text.rindex("*''''"):]
    return tail + "\n" + current_text
      
def process_wikitext(wikitext,jitter):
    """Parse "Recent additions" wikitext into Hook objects.

    Lines are read top to bottom. Hook lines and an optional image line are
    collected until the next timestamp line; the collected hooks are then
    created with the window ``[that timestamp, previous timestamp]``. Only the
    first hook of a set receives the set's image. Hooks that follow the last
    timestamp in the text are not emitted.

    Args:
        wikitext: page text as returned by generate_wikitext.
        jitter: passed on to Hook, but only for the first three sets.

    Returns:
        List of Hook objects sorted by ``total_vph``, highest first.
    """
    newer = None
    older = None
    pending = []
    output = []
    image = ""
    setnum = 0
    for line in wikitext.splitlines():
        if " (UTC)'''" in line: #timestamps
            newer = older
            # strip() keeps stray surrounding whitespace from breaking the exact-format parse
            older = datetime.datetime.strptime(line.strip(),"*'''''%H:%M, %d %B %Y (UTC)'''''")

            if newer is None:
                continue

            print(f"==={older} -> {newer}===")
            for position, hooktext in enumerate(pending):
                output.append(Hook(hooktext,[older,newer],image if position==0 else "",jitter and setnum<3))

            pending = []
            image = ""
            setnum += 1

        elif "{{main page image" in line: #image
            # Raw string: "\|" in a normal string literal is an invalid escape
            # sequence. Parameters are separated by "|" or the "{{!}}" magic word.
            parts = re.split(r"\||\{\{!\}\}",line)
            try:
                image = parts[1][parts[1].index("=")+1:]
            except ValueError:
                image = parts[1]  # unnamed first parameter, no "=" to cut at

        elif "* ... " in line or "*..." in line: #hook
            pending.append(line[line.index("..."):])
    output.sort(key = lambda x:x.total_vph, reverse=True)
    return output
    
def process_data(total,archivepagename):
    """Build the monthly summary wikitext from a list of hooks.

    Args:
        total: hooks ordered by ``total_vph``, highest first (the order
            process_wikitext returns); "low" reads the last element and "high"
            the first.
        archivepagename: page the hooks came from. A name of the form
            ".../<year>/<month>" supplies the month and year; otherwise the
            current month and year are used.

    Returns:
        Wikitext with four sections: the main summary template call and one
        table row each for all, imaged and non-imaged hooks.
    """
    parts = archivepagename.split("/")[1:]
    if len(parts) >= 2:
        year, month = parts[0], parts[1]
        monthyear = month + " " + year
        yeartarget = "/" + year
        monthyeartarget = f"/{year}/{month}"
    else:
        monthyear = datetime.datetime.now().strftime("%B %Y")
        yeartarget = "/" + monthyear.partition(" ")[2]
        monthyeartarget = "/"

    data = {
        "Total": total,
        "Imaged": [hook for hook in total if hook.image != ""],
        "Nonimaged": [hook for hook in total if hook.image == ""],
    }

    def row_header(heading, subpage):
        # Heading plus the <noinclude> wrapper that opens a table around the transcluded row.
        return f"=={heading}==\n<noinclude>This row is transcluded to [[Wikipedia:Did you know/Statistics/Monthly summary statistics{yeartarget}/{subpage}]].\n{{|class=\"wikitable\"</noinclude>\n|-"

    def fmt(value):
        return f"{round(value,1):,}"

    def linked(hook):
        return ", ".join(f"[[{x.title}]]" for x in hook.articles)

    def low(d):
        return (fmt(d[-1].total_vph), linked(d[-1]))

    def median(d):
        if len(d)%2==0:
            upper = len(d)//2
            lower = upper-1
            return (fmt((d[upper].total_vph+d[lower].total_vph)/2), "<br/>".join([linked(d[upper]), linked(d[lower])]))
        middle = (len(d)-1)//2
        return (fmt(d[middle].total_vph), linked(d[middle]))

    def high(d):
        return (fmt(d[0].total_vph), linked(d[0]))

    # Each section is kept as a list of pieces that are joined with newlines at the end.
    rows = {
        "Total": [row_header("To total table", "Total")],
        "Imaged": [row_header("To imaged table", "Imaged")],
        "Nonimaged": [row_header("To non-imaged table", "Non-imaged")],
    }
    main_section = "==To main summary page==\n{{DYK stats monthly summary table|"

    for category in ("Total","Imaged","Nonimaged"):
        hooks = data[category]
        passed = sum(hook.stats for hook in hooks)
        count = len(hooks)
        rows[category].append(f"|[[Wikipedia:Did you know/Statistics/Monthly DYK pageview leaders{monthyeartarget}|{monthyear}]]")
        rows[category].append(f"| {count}")
        rows[category].append(f"| {passed}")
        rows[category].append(f"| {round(100*passed/count,1):,}")

    for stat, func in (("Low",low),("Median",median),("High",high)):
        template_row = f"\n{{{{DYK stats monthly summary table row|{stat}"
        for category in ("Nonimaged","Imaged","Total"):
            value, titles = func(data[category])
            rows[category].append(f"| {value}")
            rows[category].append(f"| {titles}")
            template_row += f"|{value}|{titles}"
        main_section += template_row + "}}"

    sections = {name: "\n".join(pieces) for name, pieces in rows.items()}
    return f"""{main_section}
}}}}
{sections["Total"]}
<noinclude>|}}</noinclude>
{sections["Imaged"]}
<noinclude>|}}</noinclude>
{sections["Nonimaged"]}
<noinclude>|}}</noinclude>"""

def main(archivepagename="Wikipedia:Recent additions",jitter=True,edit=True,notify=None):
    """Build and publish the pageview leaders table and its summary for one archive page.

    Args:
        archivepagename: "Wikipedia:Recent additions" or one of its
            "/<year>/<month>" archive subpages.
        jitter: passed to process_wikitext.
        edit: only used to derive the default of ``notify``.
            NOTE(review): both pages are saved regardless of this flag;
            confirm whether ``edit=False`` was meant to suppress the saves.
        notify: whether to notify nominators of hooks that passed their
            threshold. Defaults to True only for the live page when ``edit``
            is true.
    """
    if notify is None:
        notify = (archivepagename == "Wikipedia:Recent additions" and edit)
    suffix = archivepagename.replace("Wikipedia:Recent additions","")
    wikitext = generate_wikitext(archivepagename) #Grab wikitext from the archive page (and the next archive page, if relevant)
    pageviews_data = process_wikitext(wikitext,jitter) #Process into a series of Hook objects
    table = f"""{{{{Wikipedia:Did you know/Statistics/Tabs|4}}}}
{{{{Wikipedia:Did you know/Statistics/Monthly DYK pageview leaders/Navigation}}}}
{{{{Excerpt|Wikipedia:Did you know/Statistics/Monthly DYK pageview leaders{suffix}/Summary|To main summary page|hat=no}}}}
{{{{clear}}}}
==Table==
{{{{DYK stats table|
{"".join([str(hook) for hook in pageviews_data])}}}}}""" #Write Hook objects into DYK stats table
    statspage = pwb.Page(site,archivepagename.replace("Wikipedia:Recent additions","Wikipedia:Did you know/Statistics/Monthly DYK pageview leaders"))
    # Compare by value: "is not" tests object identity, which is effectively
    # always true for two separately built strings, so the page was re-saved
    # on every run.
    if statspage.text != table:
        statspage.text = table
        statspage.save(summary=f"feedin' the bangtail {tag}") #editing into page
    summary = process_data(pageviews_data,archivepagename) #Obtain summary data
    summarypage = pwb.Page(site,f'Wikipedia:Did you know/Statistics/Monthly DYK pageview leaders{suffix}/Summary')
    if summarypage.text != summary:
        summarypage.text = summary
        summarypage.save(summary=f"feedin' the bangtail {tag}") #editing into page
    if notify:
        for hook in pageviews_data:
            if hook.stats:
                hook.notify() #notify nominator if past the threshold
                
# Run with the default arguments when the file is executed as a script.
if __name__ == "__main__":
    main()