User:Neoconned/LocalRefConverter

From SourceWatch
Jump to navigation Jump to search

This is a crudely hacked version of http://en.wikipedia.org/wiki/User:Cyde/Ref_converter. It converts an article that uses the {{note}}/{{ref}} template referencing system so that it uses the newer Cite.php <ref> references instead. Cyde Weys' original version is designed to run on a web server and to fetch articles directly from Wikipedia. This version runs on your PC, reads the article to convert from a local text file, and saves the converted article to another local text file. To use:

  • Install perl on your computer.
  • Unlike with the original, you don't need to install any Perl extensions from CPAN.
  • Save the code below into a file called wikirefs.txt in the directory you will run Perl from (your current working directory).
  • Put the article wiki source you want to convert into a file called convert_me.txt in that directory.
  • Run perl wikirefs.txt
  • The converted article should be a file called convertedFile.txt in that directory.
#!/usr/bin/perl
#
# "WikiRefs"
# This program converts {{note}} and {{ref}} to <references /> style on Wikipedia.
# Copyright (C) 2006 Ben "Cyde Weys" McIlwain
# Trivially modified by Neoconned (SourceWatch) to run locally, May 2007
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
#
#
#---------------------------------------------------
#         Script configuration options.
#
# Set to 1 to enable logging.  Set to 0 to disable logging.
# NOTE(review): nothing in the visible script reads these two options, and
# writeToLog is an empty stub, so they currently have no effect.
my $optLogging = 1;
#
# Location of the log file.  The default path is inherited from the original
# web-server deployment; change it if logging is ever implemented locally.
my $optLogLoc = '/var/log/apache2/refconv.log';
#
#---------------------------------------------------

sub writeToLog;

# This accumulates the number of possible things that were incorrect with {{ref}}/{{note}}.
my $numErrors = 0;

print 'Getting Wiki source...';

# Name of the input file, resolved relative to the current working directory.
my $data_file = "convert_me.txt";

# Slurp the whole article into one string.  Three-argument open with a lexical
# handle avoids mode injection through the file name, and the handle is closed
# automatically when the do-block ends.  $/ is undefined only inside the block
# so that the single readline returns the entire file.
my $fullText = do {
    open(my $in, '<', $data_file)
        or die("Could not open file '$data_file': $!");
    local $/;
    <$in>;
};

print "\n\n";
print "OPENED FILE OK \n\n";

###############################################################################

	#This keeps track of the initial length of the article before we make any changes to it.
	my $preLength = length($fullText);

	#These two variables accumulate diagnostic text while the article is processed:
	#$warnings holds things a human should double-check, $verbosage holds a step-by-step trace.
	my $warnings = "";
	my $verbosage = "";

	#Get rid of the "How to add a footnote" comment that this script makes superfluous.
	#s/// is true only when it actually removed something, so no separate match test is needed.
	if ($fullText =~ s/\<\!\-\-[^\n]*add[^\n]*footnote.*?\-\-\>//s) {
	    $warnings .= "Deleting comment on how to add old footnotes, make sure this was done correctly.\n";
	}

	#{{mnb2}} is incredibly broken
	#This is a plain yes/no test, so it must not use /g: a scalar-context //g match leaves
	#pos($fullText) set after the hit, and the list-context //g match that collects the refs
	#below would then start from that position and could skip refs earlier in the article.
	if ($fullText =~ m/\{\{mnb2/i) {
	    $warnings .= "Panic, detecting {{mnb2}}, this article is most likely broken and will need manual repair.\n";
	}

	#This goes through the article source looking for citation templates that are over one line.  This is
	#necessary because the citation templates must be inserted into the article text inline or things will break.
	#This has the side-effect of changing citation templates that aren't part of notes.  Oh well.
	#Then we need to detect if any changes have been made, and if they have, print a warning message to that effect.
	my $tempText = $fullText;
	$fullText =~ s/(\{\{cite [^\{\}]*?\}\})/my $cite = $1; $cite =~ tr{\n}{}d; $cite/egs;
	if ($tempText ne $fullText) {
	    $warnings .= "Detecting multiple line cite, trying to fix, make sure I don't make any mistakes.\n";
	}

	#Get a list of all matches of {{ref|...}} and {{ref label|...}} and {{ref harv|...}} and {{ref harvard|...}}
	#(also {{ref num|...}} and {{mn|...}}).  Only the first template argument, the ref name, is captured.

	my @matches = ($fullText =~ m/\{\{(?:mn|ref(?:[_ ]label|[_ ]harv|[_ ]harvard|[_ ]num)?)\s*\|\s*([^\|]*?)\s*(?:\|\s*[^\|\}]*?\s*)*?\}\}/gi);
	#push @matches, ($fullText =~ m/\{\{mn\s*\|\s*([^\|]*?)\s*\|\s*[^\|\}]*?\s*\}\}/gi);

	#Optional output tweaks, compared against the string "on" below.  They are declared here
	#(switched off) because nothing else in the script defines them.
	#  $smallFont  - wrap the generated <references /> in a "references-small" div.
	#  $forceNames - give singly-used refs a name="..." attribute as well.
	my $smallFont  = 'off';
	my $forceNames = 'off';

	#File the converted article is written to (relative to the current working directory).
	my $outfile = 'convertedFile.txt';

	#Opening and closing halves of a {{ref}}-family template; the ref name goes in between.
	#Shared by every per-name replacement pass below.
	my $refOpen  = qr/\{\{(?:mn|ref(?:[_ ]label|[_ ]harv|[_ ]harvard|[_ ]num)?)\s*\|\s*/i;
	my $refClose = qr/\s*(?:\|[^\|\}]*?\s*)*?\}\}/;

	#If there are no {{ref}}s in the article then there's no point in continuing.
	if (@matches) {

	    ### Reduce @matches to its unique, lowercased, sorted ref names and split those
	    # names by how often they were used.  If @matches was [a,a,b,c,d,d,e], then:
	    # @matches = [a,b,c,d,e]
	    # @matchesSingle = [b,c,e]
	    # @matchesMult = [a,d]
	    my %useCount;
	    $useCount{lc($_)}++ foreach @matches;
	    @matches = sort keys %useCount;
	    my @matchesMult   = grep { $useCount{$_} > 1 } @matches;
	    my @matchesSingle = grep { $useCount{$_} == 1 } @matches;

	    if (@matchesMult) {
		$warnings .= "Detecting multiple refs with the same name, make sure I handle this correctly.\n";
	    }

	    #refCorrs is the hash between (lowercased) ref name and note text.
	    my %refCorrs = ();
	    my $finalText = "";

	    #True until the first matched note has been handled, and only if the article does not
	    #already contain a <references> tag.  The first note removed is replaced with
	    #<references />; the rest are just deleted.  No /g here: this is a yes/no test.
	    my $needReferencesTag = ($fullText !~ m/\<references(?:\s*\/)?\>/);
	    my $firstMatch = 1;

	    #Split the full Wiki source into discrete lines and process them sequentially to see if
	    #each line contains a {{note}} or a {{note label}}.  If the line does contain a {{note}},
	    #match it up in the hash with its appropriate ref.  If it doesn't match, throw a warning
	    #and comment it out.  If it did match, remove it, and replace all removed {{note}}s with a single <references />
	    foreach my $thisLine (split /\n/, $fullText) {
		my $matched = 0;

		#Loop through each of the ref names to see if it matches with any notes on this line.  This has O(n*m) efficiency.
		foreach my $refName (@matches) {
		    next unless $thisLine =~ m/\{\{(?:mnb2?|note(?:[_ ]label)?)\s*\|\s*\Q$refName\E\s*(?:\|\s*[^\{\}]*?\s*)*\}\}\s*(.*)$/i;

		    #Everything after the note template is the note text; drop any further
		    #{{note label}} templates embedded in it.
		    my $noteText = $1;
		    $noteText =~ s/\{\{note[_ ]label\s*[^\}\{]*?\}\}//gi;

		    #Chop off leading and trailing spaces.
		    $noteText =~ s/^\s+//;
		    $noteText =~ s/\s+$//;
		    $verbosage .= "Matching up ref \"$refName\", removing from list, note is: $noteText\n";
		    $refCorrs{$refName} = $noteText;
		    $matched = 1;

		    if ($firstMatch) {
			if ($needReferencesTag) {
			    if ($smallFont eq "on") {
				$finalText .= '<div class="references-small"><references /></div>' . "\n";
			    }
			    else {
				$finalText .= "<references />\n";
			    }
			}
			$firstMatch = 0;
		    }
		}

		#A line that held a matched note is dropped from the output entirely.
		next if $matched;

		#If this line had a note with no corresponding ref, comment it out and print a warning message.
		if ($thisLine =~ m/\{\{(?:mnb2?|note)\s*\|\s*([^\|]*?)\s*\|?\s*\}\}\s*(.*)$/i) {
		    my ($deadName, $deadText) = ($1, $2);
		    $warnings .= "Note \"$deadName\" isn\'t referenced, commenting out, link was: $deadText\n";
		    $numErrors++;
		    $finalText .= "<!-- Dead note \"$deadName\": $deadText -->\n";
		}
		else {
		    $finalText .= $thisLine . "\n";
		}
	    }

	    #Go through and replace references that were only referenced once with a simple <ref>.
	    foreach my $currMatch (@matchesSingle) {
		if (exists $refCorrs{$currMatch} && $refCorrs{$currMatch} !~ m/^\s*$/) {
		    my $note = $refCorrs{$currMatch};
		    my $openTag = '<ref>';
		    if ($forceNames eq 'on') {
			#Cite.php returns an error if the refName is an integer value, so pad it out.
			my $refName = $currMatch;
			if ($refName =~ m/^\d+$/) {
			    $refName = 'ref' . $refName;
			}
			$openTag = "<ref name=\"$refName\">";
		    }
		    my $replacement = $openTag . $note . '</ref>';
		    $finalText =~ s/$refOpen\Q$currMatch\E$refClose/$replacement/gi;
		    $verbosage .= "Replacing ref \"$currMatch\" with full note: <ref>$note</ref>\n";
		}
		elsif (exists $refCorrs{$currMatch}) {
		    #Deal with blank notes.  We don't want to be inserting <ref></ref> into the article.
		    #The note exists but its text is empty/whitespace; the ref itself is left in place
		    #and is turned into {{citation needed}} by the catch-all pass further down.
		    $numErrors++;
		    $warnings .= "Found a blank note, ref is \"$currMatch\"\n";
		}
		else {
		    $numErrors++;
		    $warnings .= "Ref \"$currMatch\" doesn\'t exist in notes.  Turning into {{citation needed}}\n";
		}
	    }

	    #Now we need to go through and replace references that were referenced multiple times.
	    #We need to name our references now.
	    foreach my $currMatch (@matchesMult) {
		if (exists $refCorrs{$currMatch} && $refCorrs{$currMatch} !~ m/^\s*$/) {
		    #Cite.php returns an error if the refName is an integer value, so we'll pad it out with a character.
		    my $refName = $currMatch;
		    if ($refName =~ m/^\d+$/) {
			$refName = 'ref' . $refName;
		    }
		    my $note     = $refCorrs{$currMatch};
		    my $fullRef  = "<ref name=\"$refName\">$note</ref>";
		    my $shortRef = "<ref name=\"$refName\" />";

		    #The first occurrence carries the note text; every later one is a named back-reference.
		    $finalText =~ s/$refOpen\Q$currMatch\E$refClose/$fullRef/i;
		    $finalText =~ s/$refOpen\Q$currMatch\E$refClose/$shortRef/gi;
		    $verbosage .= "Replacing multiply referenced \"$refName\" with full notes: <ref>$note</ref>\n";
		}
		elsif (exists $refCorrs{$currMatch}) {
		    #Deal with blank notes.  We don't want to be inserting <ref></ref> into the article.
		    $numErrors++;
		    $warnings .= "Found a blank multiply referenced note, ref is \"$currMatch\"\n";
		}
		else {
		    $numErrors++;
		    $warnings .= "Multiple reference \"$currMatch\" doesn\'t exist in notes.  Turning into {{citation needed}}\n";
		}
	    }

	    #One more loop through any remaining {{ref}} tags to turn them into {{citation needed}}.
	    $finalText =~ s/\{\{(?:mn|ref(?:[_ ]label|[_ ]harv|[_ ]harvard|[_ ]num)?)\s*\|\s*[^\|]*?\s*(?:\|[^\|\}]*?\s*)*?\}\}/\{\{citation needed\}\}/gi;

	    #Remove excess blank lines that we may have just made by deleting the content inbetween.
	    if ($finalText =~ s/\n{4,}/\n\n/g) {
		$warnings .= "I think I have found too many consecutive newlines, I am going to remove them, make sure I did this right.\n";
	    }

	    #Final sanity checks.  These deliberately do not use /g: with /g each successful test
	    #would leave pos($finalText) set and the next test would start searching from there,
	    #missing leftovers that occur earlier in the text.
	    if ($finalText =~ m/\{\{ref/i) {
		$warnings .= "Failing sanity check, there may still be some {{ref}}s left.\n";
	    }
	    if ($finalText =~ m/\{\{note/i) {
		$warnings .= "Failing sanity check, there may still be some {{note}}s left.\n";
	    }
	    if ($finalText =~ m/\{\{mn/i) {
		$warnings .= "Failing sanity check, there may still be some Footnote4 stuff left ({{mn}} or {{mnb}}).\n";
	    }

	    print "Finished.\n";

	    #Double-quoted so that \n is a real newline rather than a literal backslash-n.
	    print "\n\n\n";
	    print "WRITING OUTPUT FILE \n\n";

	    #Three-argument open with a lexical handle; close is checked because buffered
	    #write errors only surface when the handle is flushed.
	    open(my $out, '>', $outfile) or die "Can't open $outfile : $!";
	    print {$out} $finalText;
	    close($out) or die "Can't close $outfile : $!";
	}
	else {
	    print "No {{ref}} templates found in the input; nothing to convert, no output file written.\n";
	}

	#Show the accumulated warnings so the user knows what to double-check by hand.
	if ($warnings ne "") {
	    print "\nWarnings ($numErrors probable error(s)):\n$warnings";
	}

#Placeholder for writing log output to a file.  The body does nothing yet:
#any arguments are ignored and nothing is returned.
sub writeToLog {
    return;
}