User:Cedars/gaauto.pl

The following Perl script is a hack that automatically creates a categorized list of good articles in the same format as the good articles page. The script understands quoted, italicized and disguised article links. It uses the existing list as a basis for the new list. It removes old good articles from the revised list and offers the user the opportunity to categorize new good articles. It sorts and counts every article and can automatically adapt to use new headings and subheadings. It allows dual listings and major headings. The script uses cURL to download existing content and the Roman 1.1 Perl module to sort Final Fantasy titles. The script is designed to assist the human editing of Wikipedia articles, not replace it. It is best that users still add and remove articles from the list as they would without the script - this is because they are likely to categorize the items better than the script user. Please feel free to make changes to this page if you feel they would improve the script. If you have comments on the script please feel free to post them on the talk page.

A brief note on output

The script downloads several files to the working directory and outputs two files. The first file, "output_headings.txt", is a file listing the levels and sublevels available for categorization. This file is output before any requests for categorization are made. The second file, "output.txt", is the formatted wiki-syntax for the list. It may appear corrupt if not opened using UTF-8. The script also outputs a timestamp list of when the most recently added articles were added, "stamp.time", and a backup of the previous version of that list, "stamp.bac". If the timestamp list and backup are dramatically different from each other the script will refuse to run. This is to prevent the timestamp list from becoming distorted and thus damaging the recently added list.

To view script properly use edit mode

#! /usr/bin/perl

use Roman;
use open ':utf8';

# Download a fresh copy of files
# When true, curl fetches the good articles edit page and the category pages
# into input_ga.html / input_cat<N>.html; when false, the files already in the
# working directory are read instead.
$DOWNLOAD = 1;

# Warn of removed articles
# When true, a "REMOVED ARTICLE" line is printed for every listed article that
# was not found in the category.
$REMOVED = 1;

# Number of new articles to remember
# Upper bound on the entries shown in the "Recently listed good articles" box.
$NEWARTICLES = 14;

# Should open web browser or text editor
# Enables the "w" answer at the categorization prompt and the closing offer to
# open the edit page and output.txt.
$ADVANCED = 0;

# Adds section comments (improves editing)
# When true, each "== ==" section line carries the heading name in an HTML
# comment.
$SECTIONCOMMENTS = 0;

# Web browser and text editor commands
# Both are run through the shell (backticks) with the URL or file name appended.
$WEBBROWSER = "open";
$TEXTEDITOR = "open";

# Sorts article titles
#
# titlesort is the comparator handed to sort(); it forwards the two records
# being compared ($a and $b) to titlecmp.
sub titlesort { return titlecmp($a, $b); }

# Compares two article records by the title a reader sees.
#
# Parameters: two hash references, each with a "name" key holding the inside
#             of a wiki link ("Target" or "Target|Displayed text").
# Returns:    a negative, zero or positive number, like cmp and <=>.
#
# When both names contain "Final Fantasy <roman numeral>" the numerals are
# converted with arabic() and compared as numbers. Any other pair is compared
# case-insensitively on the displayed text (the part after the last "|"), with
# "~~..." suffixes and '' italic markers removed.
sub titlecmp {

	# Grab name (lexicals, so the comparators do not share package globals)
	my ($left, $right) = @_;
	my $x = $left->{"name"};
	my $y = $right->{"name"};

	# Handle Final Fantasy titles
	# The class is [XVI], not [X|V|I]: inside a character class "|" is a
	# literal pipe, so a piped link such as "Final Fantasy IX|..." used to
	# capture "IX|", which is not a roman numeral. The \b keeps a word that
	# merely starts with I, V or X from being read as a numeral.
	my ($x_numeral) = $x =~ /Final Fantasy ([XVI]+)\b/;
	my ($y_numeral) = $y =~ /Final Fantasy ([XVI]+)\b/;
	if (defined($x_numeral) && defined($y_numeral)) {
		return arabic($x_numeral) <=> arabic($y_numeral);
	}

	# Handle other titles
	foreach my $title ($x, $y) {
		$title =~ s/~~.*//g;
		$title =~ s/''//g;

		# Keep only the displayed text of a piped link
		if ($title =~ /.*\|(.*)/) {
			$title = $1;
		}
	}
	return uc($x) cmp uc($y);
}

# Sorts article names
#
# basicsort is the comparator handed to sort(); it forwards the two records
# being compared ($a and $b) to basiccmp.
sub basicsort { return basiccmp($a, $b); }

# Compares two article records by link target.
#
# Parameters: two hash references with a "name" key.
# Returns:    the result of a case-insensitive cmp on the names after "~~..."
#             suffixes and '' markers are removed and, for piped links, only
#             the part before the last "|" is kept.
sub basiccmp {
	my ($left, $right) = @_;
	my ($left_name, $right_name) = ($left->{"name"}, $right->{"name"});

	foreach my $target ($left_name, $right_name) {
		$target =~ s/~~.*//g;
		$target =~ s/''//g;
		$target = $1 if $target =~ /(.*)\|.*/;
	}

	return uc($left_name) cmp uc($right_name);
}

# Sorts article time stamps
#
# timesort is the comparator handed to sort(); it forwards the two records
# being compared ($a and $b) to timecmp.
sub timesort { return timecmp($a, $b); }

# Compares two article records by their "time" value, newest first.
#
# Parameters: two hash references with a "time" key.
# Returns:    0 when both times are negative; otherwise a record with a
#             negative time sorts after one without; two non-negative times
#             are ordered from largest to smallest.
sub timecmp {
	my ($left, $right) = @_;
	my $left_time = $left->{"time"};
	my $right_time = $right->{"time"};

	my $left_unset = ($left_time < 0);
	my $right_unset = ($right_time < 0);

	return 0 if $left_unset && $right_unset;
	return 1 if $left_unset;
	return -1 if $right_unset;

	# Both are real time stamps: reverse numeric order
	return $right_time <=> $left_time;
}

# Keep backup of timestamp file
#
# If a backup already exists and its size differs from the current list by
# more than 1024 bytes, the script stops rather than overwrite the backup.
# A missing stamp.time counts as size 0 for this comparison.
if (-f "stamp.bac") {
	$stamp_size = (-s "stamp.time") || 0;
	$stamp_bac_size = (-s "stamp.bac") || 0;
	if (abs($stamp_size - $stamp_bac_size) > 1024) {
		print "Large change in timestamp file. This script will now quit to prevent data loss.\n";
		print "Please delete the \"stamp.bac\" file to continue.\n";
		exit(1);
	}
}

# Only copy when there is something to copy (there is no stamp.time on the
# first run). The list form of system() runs cp without a shell.
if (-f "stamp.time") {
	if (system("cp", "stamp.time", "stamp.bac") != 0) {
		warn "Warning: could not back up \"stamp.time\" to \"stamp.bac\".\n";
	}
}

# Download the current good articles file
# The edit page is fetched because its <textarea> carries the raw wiki text.
# The URL uses https: Wikipedia redirects plain-http requests, and curl does
# not follow redirects unless told to.
if ($DOWNLOAD) {
	system "curl \"https://en.wikipedia.org/w/index.php?title=Wikipedia:Good_articles&action=edit\" > input_ga.html";
}

# Read the good articles file
if (!open(FILE, "<", "input_ga.html")) {
	print "Could not open \"input_ga.html\": $!\n";
	exit(1);
}
@input = <FILE>;
close(FILE);
$input_len = $#input + 1;

# Go through each line of the good articles file
#
# The loop is a small state machine over the lines of the edit page:
#   - before the <textarea>: nothing is kept;
#   - preamble ($preamble_on): every line up to and including the one that
#     mentions "Gapages" is stored in @preamble for re-output;
#   - main list ($main_on): language links, the recently-added image, major
#     headings, headings, subheadings and article links are collected.
# Results: @preamble, @lang, $new_articles_image, @major_text/@major_icon,
# @headings/@headings_icon/@headings_major/@subheadings_len and @articles,
# each with its *_len counter.
$major = -1;
$level = -1;
$sublevel = 0;
$headings_len = 0;
$articles_len = 0;
$preamble_len = 0;
$preamble_on = 0;
$main_on = 0;
$lang_len = 0;
$start = 0;
for ($i = 0; $i < $input_len; $i++) {

	# Get the current line
	# The textarea content is HTML-escaped, so the entities must be decoded
	# before the "<div ...>" patterns below can match. "&amp;" is decoded last
	# so that wiki text which itself contains an entity (escaped on the page as
	# "&amp;lt;" and so on) is not decoded twice.
	$curline = $input[$i];
	$curline =~ s/&lt;/</g;
	$curline =~ s/&gt;/>/g;
	$curline =~ s/&quot;/\"/g;
	$curline =~ s/&amp;/&/g;
	
	# Handle preamble
	if ($preamble_on) {
		if ($curline =~ /Gapages/) {
			$preamble_on = 0;
			$main_on = 1;
		}
		if ($preamble_len == 0) {
			# Drop anything up to the last ">" on the first stored line
			$curline =~ s/.*>//;
		}
		$preamble[$preamble_len] = $curline;
		$preamble_len++;
	}
	elsif ($main_on) {
	
		# If it is a language remember it
		if ($curline =~ /\[\[[^W][^P]\:[^\]]*\]\]/) {
			$lang[$lang_len] = $curline;
			$lang_len++;
		}
	
		# If it is a recently added article image remember it
		if ($curline =~ /colspan=2.*\[\[Image:(.*)\]\]/) {
			$new_articles_image = $1;
		}
		
		# If it is a major heading add it to the major headings
		# $realpart becomes the heading text, $imagpart the leading [[...]] icon
		if ($curline =~ /<div style="padding:[^>]*>([^<]*)<\/div>/) {
			$major += 1;
			$realpart = $1;
			$imagpart = $1;
			$realpart =~ s/\[\[.*\]\]//;
			$realpart =~ s/'''//g;
			$imagpart =~ s/[^\]]*$//;
			$major_text[$major] = $realpart;
			$major_icon[$major] = $imagpart;
		}
		
		# If it is a heading add it to the headings
		if ($curline =~ /<div class="NavHead"[^>]*>([^<]*)<\/div>/) {
			$level += 1;
			$sublevel = 0;
			$headings_len += 1;
			$subheadings_len[$level] = 0;
			$realpart = $1;
			$imagpart = $1;
			$realpart =~ s/\[\[.*\]\]//;
			$imagpart =~ s/[^\]]*$//;
			$headings[$level][$sublevel] = $realpart;
			$headings_icon[$level] = $imagpart;
			$headings_major[$level] = $major;
		}
		
		# If it is a subheading add it to the headings and start counting articles
		if ($curline =~ /=====(.*)=====$/) {
			$sublevel += 1;
			$subheadings_len[$level] += 1;
			$headings[$level][$sublevel] = $1;
			$start = 1;
		}
		
		# If it is a closing div stop counting articles
		if ($curline =~ /\/div/) {
			$start = 0;
		}
		
		# If it is an article add it to the articles list
		if ($start && $curline =~ /\[\[[^\]]*\]\]/) {	

			# The link text is whatever sits inside the last [[...]] on the line
			$searchstr = $curline;
			$searchstr =~ s/.*\[\[([^\]]*)\]\].*\n$/$1/;

			# Keep a trailing <!-- comment --> so it can be written back out
			if ($curline =~ /.*\[\[[^\]]*\]\].*<!--.*-->.*\n$/) {
				$commentstr = $curline;
				$commentstr =~ s/.*\[\[[^\]]*\]\].*<!--\ *(.*)\ *-->.*\n$/$1/;
				$commentstr =~ s/\ +$//;
				$articles[$articles_len]{"comment"} = $commentstr;
			}

			# Remember whether the link was italicized or quoted
			if ($curline =~ /^\ *\'\'/) { $articles[$articles_len]{"italic"} = 1; }
			else { $articles[$articles_len]{"italic"} = 0; }
			if ($curline =~ /^\ *&quot/ || $curline =~ /^\ *\"/) { $articles[$articles_len]{"quote"} = 1; }
			else { $articles[$articles_len]{"quote"} = 0; }

			$articles[$articles_len]{"name"} = $searchstr;
			$articles[$articles_len]{"level"} = $level;
			$articles[$articles_len]{"sublevel"} = $sublevel;
			$articles[$articles_len]{"verified"} = 0;
			$articles[$articles_len]{"multi"} = 0;
			$articles[$articles_len]{"time"} = time();
			$articles_len += 1;
		}
	
	}
	else {
		# Still in the page chrome: wait for the line that opens the textarea
		if ($curline =~ /textarea/) {
			$preamble_on = 1;
		}
	}
		
}

# Check download worked
if ($articles_len == 0) {
	print "Download of good article list failed.\n";
	exit(1);
}

# Sort the articles list
@articles = sort basicsort @articles;

# Check for multiple entries
# After sorting, listings of the same article are adjacent. The first one is
# kept; a repeat only records its level and sublevel on the kept record as the
# secondary listing and sets the "multi" flag.
@narticles = ($articles[0]);
$narticles_len = 1;
foreach my $index (1 .. $articles_len - 1) {
	if (basiccmp($articles[$index], $articles[$index - 1]) != 0) {
		push(@narticles, $articles[$index]);
		$narticles_len++;
		next;
	}
	my $kept = $narticles[$narticles_len - 1];
	$kept->{"multi"} = 1;
	$kept->{"sec_level"} = $articles[$index]{"level"};
	$kept->{"sec_sublevel"} = $articles[$index]{"sublevel"};
}
@articles = @narticles;
$articles_len = $narticles_len;

# Go through each of the category files
#
# The category is paginated: every page is saved as input_cat<N>.html, each
# ">Talk:Name<" link text on it is recorded in @cat_articles, and the href of
# the "next 200" link (if any) becomes the next page to fetch. $next is set to
# -1 when a page has no such link, which ends the loop.
$cat_articles_len = 0;
$next = "https://en.wikipedia.org/wiki/Category:Wikipedia_good_articles";
for ($i = 1; $next ne "-1"; $i++) {

	# Download the category file
	# List form: $next comes from the downloaded page, so it must not pass
	# through a shell. "-o" replaces the former "> file" redirection.
	if ($DOWNLOAD) {
		system("curl", "-o", "input_cat$i.html", $next);
	}
	
	# Read the category file
	# A missing file simply yields no lines, which ends the loop below
	undef @input;
	if (open(FILE, "<", "input_cat$i.html")) {
		@input = <FILE>;
		close(FILE);
	}
	$input_len = $#input + 1;
	$next = -1;
	
	# Go through each line of the category file
	for ($j = 0; $j < $input_len; $j++) {
	
		# Get the current line
		# Decode "&amp;" so names and the next-page href contain a plain "&"
		$curline = $input[$j];
		$curline =~ s/&amp;/&/g;
	
		# If it is an article add it to the category articles list
		# A line may hold several links; each one is removed once recorded
		while ($curline =~ s/>Talk:([^<]*)<//) {
			$cat_articles[$cat_articles_len]{"name"} = $1;
			$cat_articles_len += 1;
		}
	
		# Find the next category file
		if ($curline =~ /<a.*href=\"([^\"]*)\"[^>]*>next 200/) {
			$next = "https://en.wikipedia.org".$1;
		}
	
	}

}

# Check download worked
if ($cat_articles_len == 0) {
	print "Download of good article category failed.\n";
	exit(1);
}

# Print the headings to file
# One line per heading ("<level>.0") followed by one indented line per
# subheading ("<level>.<sublevel>"); these are the numbers the user is asked
# for when categorizing a new article.
open(FILE, ">output_headings.txt");
foreach my $level_index (0 .. $headings_len - 1) {
	print FILE "$level_index.0    $headings[$level_index][0]\n";
	foreach my $sub_index (1 .. $subheadings_len[$level_index]) {
		print FILE "  $level_index.$sub_index  $headings[$level_index][$sub_index]\n";
	}
}
close(FILE);

# Sort category articles list
@cat_articles = sort basicsort @cat_articles;
$orig = 0;

# Go through each of the category articles
#
# Both lists are sorted the same way, so the search for each category article
# starts at $orig, the position just after the previous hit, and wraps around
# the articles list once. A hit marks the listed article as verified; a miss
# asks the user where the new article belongs.
for ($j = 0; $j < $cat_articles_len; $j++)  {

	# Search the articles list for the current category article
	$found_index = -1;
	if (basiccmp($articles[$orig], $cat_articles[$j]) == 0) {
		$found_index = $orig;
		$orig = ($orig + 1) % $articles_len;
	}
	else {
		# Every index is taken modulo $articles_len. Without the wrap on the
		# start value index 0 was skipped when $orig was the last index, and
		# without the wrap on the new $orig the loop could never come back to
		# its starting point.
		for ($i = ($orig + 1) % $articles_len; $i != $orig && $found_index == -1; $i = ($i + 1) % $articles_len) {
			if (basiccmp($articles[$i], $cat_articles[$j]) == 0) {
				$found_index = $i;
				$orig = ($i + 1) % $articles_len;
			}
		}
	}
		
	# If an article is found mark it verified otherwise add a new article to the list
	if ($found_index != -1) {
		$articles[$found_index]{"verified"} = 1;

		# Take the link target from the category, keeping a lower-case first
		# letter if the list spelled it that way
		$name_lower = 0;
		if (substr($articles[$found_index]{"name"}, 0, 1) ne substr($cat_articles[$j]{"name"}, 0, 1)) {
			$name_lower = 1;
		}
		$articles[$found_index]{"name"} =~ s/[^|]*/$cat_articles[$j]{"name"}/;
		if ($name_lower) {
			$articles[$found_index]{"name"} = lcfirst($articles[$found_index]{"name"});
		}
	}
	else {
		# The record is built on the side and stored only once the user has
		# placed it; an ignored article must not leave a half-filled element
		# at the end of @articles for the later sorts to pick up.
		my %new_article = ("name" => $cat_articles[$j]{"name"});
		print "Article not found: ".$cat_articles[$j]{"name"}."\n";
		$done = 0;
		do {
			print "Which level do you what to assign it to? (t for list, n for ignore)\n";
			$in = <STDIN>;
			chomp($in);
			$in = lc($in);
			if ($in eq "w") {
				# Show the headings and open the article in the browser
				if ($ADVANCED) {
					open(FILE, "output_headings.txt");
					@input = <FILE>;
					foreach $line (@input) { print $line; }
					close(FILE);
					$artname = $cat_articles[$j]{"name"};
					$artname =~ s/\"//g;
					$artname =~ s/ /_/g;
					# The title comes from a web page and ends up inside a
					# double-quoted shell word: percent-encode the characters
					# the shell would still expand there.
					$artname =~ s/([\$\`\\])/sprintf("%%%02X", ord($1))/ge;
					`$WEBBROWSER "http://en.wikipedia.org/w/index.php?title=$artname"`;
				}
			}
			elsif ($in eq "t") {
				# Show the headings
				open(FILE, "output_headings.txt");
				@input = <FILE>;
				foreach $line (@input) { print $line; }
				close(FILE);
			}
			elsif ($in eq "exit" || $in eq "q") {
				exit(1);
			}
			elsif ($in eq "n") {
				$done = 1;
			}
			else {
				# Anything else is taken as the level number
				$new_article{"level"} = $in;
				$done = 1;
			}
		} while (!$done);
		if (!($in eq "n")) {
			print "Which sublevel do you what to assign it to?\n";
			$new_article{"sublevel"} = <STDIN>;
			chomp($new_article{"sublevel"});
			$new_article{"verified"} = 1;
			$new_article{"multi"} = 0;
			$new_article{"time"} = time();
			$articles[$articles_len] = { %new_article };
			$articles_len++;
		}
	}
	
}

# Open the time stamps
#
# stamp.time holds one "[[name]] time" line per article, written by an earlier
# run of this script, so its lines are used as they are. Every article found
# in it gets the stored time back instead of the current time. The file does
# not exist on the first run; that is treated as an empty list.
@input = ();
if (open(FILE, "<", "stamp.time")) {
	@input = <FILE>;
	close(FILE);
}
$input_len = $#input + 1;
$orig = 0;
for ($i = 0; $i < $input_len; $i++) {

	# Get the current line
	$curline = $input[$i];
	
	# Fill out the stamp
	# Lines without a [[name]] are skipped; otherwise $1 would still hold the
	# name captured from an earlier line.
	next unless $curline =~ s/\[\[(.*)\]\]//;
	$stamp[0]{"name"} = $1;
	$stamp[0]{"time"} = int($curline);

	# Search the articles list for a match
	# Circular search starting just after the previous hit; the start index is
	# wrapped so that index 0 is not skipped when $orig is the last index.
	$found_index = -1;
	if (basiccmp($articles[$orig], $stamp[0]) == 0) {
		$found_index = $orig;
		$orig = ($orig + 1) % $articles_len;
	}
	else {
		for ($j = ($orig + 1) % $articles_len; $j != $orig && $found_index == -1; $j = ($j + 1) % $articles_len) {
			if (basiccmp($articles[$j], $stamp[0]) == 0) {
				$found_index = $j;
				$orig = ($j + 1) % $articles_len;
			}
		}
	}
	
	# Assign the time stamp
	if ($found_index != -1) {
		$articles[$found_index]{"time"} = $stamp[0]{"time"};
	}
}

# Find the new articles
#
# The verified articles with the newest time stamps (at most $NEWARTICLES)
# form the recently-listed box; every other verified article has its time
# reset to -1. All verified articles are written back to stamp.time.
if (!open(FILE, ">", "stamp.time")) {
	print "Could not write \"stamp.time\": $!\n";
	exit(1);
}
$new_articles_count = 0;

# The loops below count up to $articles_len, so make sure the array holds
# exactly that many records before sorting; a stray trailing element would
# otherwise be sorted into the counted range and push a real article out.
$#articles = $articles_len - 1;

@articles = sort timesort @articles;
for ($i = 0; $i < $articles_len; $i++) {
	if ($articles[$i]{"verified"}) {
		if ($new_articles_count < $NEWARTICLES && $articles[$i]{"time"} != -1) {
			$new_articles[$new_articles_count] = $articles[$i];
			$new_articles_count++;
		}
		else {
			$articles[$i]{"time"} = -1;
		}
		print FILE "[[".$articles[$i]{"name"}."]] ".$articles[$i]{"time"}."\n";
	}
}
close(FILE);
@new_articles = sort titlesort @new_articles;

# Sort the articles again
@articles = sort basicsort @articles;

# Open the output file
# FILE stays open after this section; the sections that follow keep writing
# to it.
if (!open(FILE, ">", "output.txt")) {
	print "Could not write \"output.txt\": $!\n";
	exit(1);
}

# Print out preamble
for ($i = 0; $i < $preamble_len; $i++) {
	print FILE $preamble[$i];
}

# Print the recently added articles
print FILE "|-\n| colspan=2 width=\"100%\" style=\"padding:1em 1em 1em 1em; border:1px solid #dfdfdf; background-color:#E0EDFA\"  valign=\"top\" align=\"center\"|";
if ($new_articles_image) {
	print FILE "[[Image:".$new_articles_image."]]";
}
print FILE "\n'''Recently listed good articles'''\n\n";
$pre = 0;
for ($i = 0; $i < $new_articles_count; $i++) {
	# Entries are separated by an em dash. It is written as \x{2014}, like the
	# article counts further down, so the :utf8 output layer encodes it once.
	if ($pre) { print FILE " \x{2014}\n"; }

	# Quoted links get their quotation marks back, italic links their ''
	if ($new_articles[$i]{"quote"}) { print FILE "\"[[".$new_articles[$i]{"name"}."]]\""; }
	elsif ($new_articles[$i]{"italic"}) { print FILE "''[[".$new_articles[$i]{"name"}."]]''"; }
	else { print FILE "[[".$new_articles[$i]{"name"}."]]"; }
	$pre = 1;
}
print FILE "\n|}\n\n__NOTOC__\n";
print FILE "<div style=\"clear:both;\">\n";
print FILE "<!-- DO NOT REMOVE THIS DIV, USED TO FORCE IE TO DISPLAY BACKGROUND FOR ARTS DIV -->\n";
print FILE "</div>\n";

# Go through each heading and subheading
#
# For every heading (and each of its subheadings) the markup is written to
# FILE, followed by the verified articles filed there, sorted by title, and a
# count. $total_count adds up primary listings only, so an article that is
# also listed in a second place is counted once.
$article_count = 0;
$total_count = 0;
$major = -1;
for ($i = 0; $i < $headings_len; $i++) {

	# Print out major heading
	if ($headings_major[$i] > $major) {
		$major = $headings_major[$i];
		if ($major > 0) {
			print FILE "</div>\n</div>\n";
		}
		print FILE "<div style=\"clear:both;\">\n";
		print FILE "<span id=\"$major_text[$major]\" />\n";
		print FILE "<div style=\"padding:5px 5px 8px 5px; background-color:#CCCCFF; text-align:left; font-size:larger;\">$major_icon[$major]'''$major_text[$major]'''</div>\n";
		print FILE "<div style=\"text-align:left;\">\n";
	}

	for ($j = 0; $j < $subheadings_len[$i] + 1; $j++) {
	
		# Write the heading or subheading
		# ($j == 0 is the heading itself, anything above is a subheading)
		if ($j == 0) {
			if ($i != 0) {
				# Close the previous heading's frame
				print FILE "</div>\n";
				print FILE "</div>\n";
				print FILE "\n";
			}
			print FILE "<div style=\"clear:both;\" class=\"NavFrame\">\n";
			print FILE "<div class=\"NavHead\" style=\"padding:2px 2px 2px 30px; background-color:#FFFAF0; text-align:left; font-size:larger;\">$headings_icon[$i]$headings[$i][$j]</div>\n";
			print FILE "<div class=\"NavContent\" style=\"text-align:left;\">\n";
			if ($SECTIONCOMMENTS) {
				print FILE "==<!--$headings[$i][$j]--> ==\n";
			}
			else {
				print FILE "== ==\n";
			}
		}
		else {
			print FILE "\n=====".$headings[$i][$j]."=====\n";
		}
		
		# Run through the articles adding them if they belong to the current level
		undef @cur_articles;
		$article_count = 0;
		for ($k = 0; $k < $articles_len; $k++) {
			if ($articles[$k]{"level"} == $i && $articles[$k]{"sublevel"} == $j) {
				if ($articles[$k]{"verified"}) {
					$cur_articles[$article_count] = $articles[$k];
					$article_count++;
					$total_count++;
				}
				else {
					if ($REMOVED) {
						print "REMOVED ARTICLE: ".$articles[$k]{"name"}."\n";
					}
				}
			}
			elsif ($articles[$k]{"multi"} == 1 && $articles[$k]{"sec_level"} == $i && $articles[$k]{"sec_sublevel"} == $j) {
				# Secondary listing of an article filed elsewhere
				if ($articles[$k]{"verified"}) {
					$cur_articles[$article_count] = $articles[$k];
					$article_count++;
				}
			}
		}
		
		# Then sort and print the articles
		if ($article_count > 0) {
			@cur_articles = sort titlesort @cur_articles;
			$pre = 0;
			for ($k = 0; $k < $article_count; $k++) {
				# Em dash separator, written as \x{2014} like the counts below
				# so the :utf8 output layer encodes it once
				if ($pre) { print FILE " \x{2014}\n"; }
				if ($cur_articles[$k]{"quote"}) { print FILE "\"[[".$cur_articles[$k]{"name"}."]]\""; }
				elsif ($cur_articles[$k]{"italic"}) { print FILE "''[[".$cur_articles[$k]{"name"}."]]''"; }
				else { print FILE "[[".$cur_articles[$k]{"name"}."]]"; }
				if ($cur_articles[$k]{"comment"}) { print FILE " <!-- ".$cur_articles[$k]{"comment"}." -->"; }
				$pre = 1;
			}
			if ($article_count == 1) {
				print FILE "\n<small>\x{2014} (1 article)</small>\n";
			}
			else {
				print FILE "\n<small>\x{2014} (".$article_count." articles)</small>\n";
			}
		}
		
	}
}

# Close the output file
# Close the last heading's two divs, write back the language links collected
# from the original page, add the category link and finish the file.
print FILE "</div>\n", "</div>\n\n";
foreach my $lang_line (@lang[0 .. $lang_len - 1]) {
	print FILE $lang_line;
}
print FILE "\n", "[[Category:Wikipedia good articles|  ]]\n";
close(FILE);

# Reopen the output file and reprint with correct number of articles
# The total is only known once the whole list has been written, so the file
# is read back and the two places that quote a count are rewritten.
open(FILE, "output.txt");
@input = <FILE>;
close(FILE);
$input_len = scalar(@input);

open(FILE, ">output.txt");
foreach my $line (@input) {
	$line =~ s/\[\[Wikipedia\:Good articles\/Statistics\|[0-9]*\]\]/\[\[Wikipedia\:Good articles\/Statistics\|$total_count\]\]/;
	$line =~ s/expr: \{\{NUMBEROFARTICLES\:R\}\} \/ [0-9]*/expr: \{\{NUMBEROFARTICLES\:R\}\} \/ $total_count/;
	print FILE $line;
}
close(FILE);

# Print out total number of articles
print "Number of articles: $total_count\n";

# Open for editing
# Only offered in advanced mode; "y" opens the edit page in the browser and
# output.txt in the text editor.
if ($ADVANCED) {
	print "Do you want me to open your browser for editing? (y/n)\n";
	chomp($in = <STDIN>);
	$in = lc($in);
	if ($in eq "y") {
		`$WEBBROWSER "http://en.wikipedia.org/w/index.php?title=Wikipedia:Good_articles&action=edit"`;
		`$TEXTEDITOR "output.txt"`;
	}
}