Toolserver:~gregbard/test.cgi

From mediawiki.org

This page was moved from the Toolserver wiki.
Toolserver has been replaced by Toolforge. As such, the instructions here may no longer work, but may still be of historical interest.
Please help by updating examples, links, template links, etc. If a page is still relevant, move it to a normal title and leave a redirect.

#!/usr/bin/perl
use strict;                   # 'strict' insists that all variables be declared
use diagnostics;              # 'diagnostics' expands the cryptic warnings
use open 'utf8';

use lib $ENV{HOME} . '/public_html/wp/modules'; # path to perl modules
require 'bin/perlwikipedia_utils.pl'; # my own packages, this and the one below
require 'bin/fetch_articles.pl';
require 'bin/rm_extra_html.pl';
require 'strip_accents_and_stuff.pl';
require 'lists_utils.pl';
undef $/; # undefines the separator. Can read one whole file in one scalar.

# Collect the philosophy articles from the philosophy categories. Merge them into the [[Index of philosophy]] on Wikipedia.
# Remove redlinks, redirects, and disambig pages. Submit to Wikipedia the log of changes and newly detected categories. This runs daily.

MAIN: {

  # Driver of the daily run:
  #   1. read the category lists and fetch the articles found in the philosophy categories,
  #   2. build the blacklist (redlinks, philosophers, user-selected entries, redirects),
  #   3. merge the new entries into each per-letter list and submit it,
  #   4. post the newly detected categories,
  #   5. build today's log of changes, merge in the philosophers log, and submit it.

  $| = 1; # flush the buffer each line

  my ($letter, %blacklist, @articles_from_cats, $text, $file, $sleep, $attempts, $edit_summary, $todays_log);
  my ($list_of_categories, @letters, @philosophy_categories, @philosopher_categories, @other_categories, $log_file);
  my ($articles_from_cats_file, $all_phil_arts_file, @new_categories, %all_articles, $philosophers_logfile, $prefix, $Editor);
  @letters = ("0-9", "A" .. "Z");

  # Files involved (they are many).
  $list_of_categories='List_of_philosophy_categories.wiki';
  $log_file="User:Philosobot/Changes_to_phillists.wiki";

  # The files with the .txt extension are local, they don't get submitted to Wikipedia.
  $articles_from_cats_file='All_philosophy_from_cats.txt';
  $all_phil_arts_file='All_philosophy.txt';
  $philosophers_logfile='Philosophers_log.txt';

  $prefix = "Index of philosophy";

  $sleep = 5; $attempts=500; # necessary to fetch data from Wikipedia and submit
  $Editor=wikipedia_login();

  # Get today's articles found in categories
  &read_categories_from_list(\@philosophy_categories,\@philosopher_categories,\@other_categories,
                             $list_of_categories);
  &fetch_articles(\@philosophy_categories, \@articles_from_cats, \@new_categories);
  @articles_from_cats=&randomize_array(@articles_from_cats); # to later identify entries differing only by capitals

  # articles which we will not allow in the phil list for various reasons
  &put_redlinks_on_blacklist($prefix, \@letters, \%blacklist);
  &put_philosophers_on_blacklist_and_user_selected_also(\%blacklist);
  &put_redirects_on_blacklist(\%blacklist, $articles_from_cats_file, \@articles_from_cats);

  # go letter by letter, and merge the new entries
  foreach $letter (@letters){
    $file = "$prefix ($letter).wiki";

    $text=wikipedia_fetch($Editor, $file, $attempts, $sleep);  # fetch the lists from Wikipedia
    exit (0) if ($text =~ /^\s*$/);                      # quit if can't get any of the lists

    # the heart of the code
    $text = &merge_new_entries_from_categories($letter, $text, \@articles_from_cats, \%blacklist, \%all_articles);

    $edit_summary="Daily update. See the log at [[User:Philosobot/Changes to phillists]].";
    wikipedia_submit($Editor, $file, $edit_summary, $text, $attempts, $sleep);

  }
  &post_newly_detected_categories(\@philosophy_categories, \@philosopher_categories, \@other_categories, \@new_categories);

  # create the log of changes to the phil articles. Merge with the changes to philosopher articles. Submit.
  $todays_log=&process_log_of_todays_changes(\%all_articles, \%blacklist, $all_phil_arts_file); # changes to the philosophy articles

  # Read the philosophers log in one piece. A missing or unreadable file is
  # reported and treated as an empty log, so the philosophy log is still submitted.
  $text = '';
  if (open(my $philosophers_fh, '<', $philosophers_logfile)) {
    local $/;                                  # slurp, independent of the global setting
    $text = <$philosophers_fh>;
    $text = '' unless defined $text;
    close($philosophers_fh);
  } else {
    warn "Cannot read $philosophers_logfile: $!\n";
  }

  # Drop a leading "==...==" heading, then prefix every line that starts with
  # ':' by ": Philosophers" and lower-case the character following the colon.
  $text =~ s/^==.*?==\s*//g; $text =~ s/(^|\n)(:.)/"$1: Philosophers" . lc($2)/eg;
  $todays_log = $todays_log . "----\n" . $text;
  &merge_logs_and_submit($todays_log, $log_file);
}

# Articles which we will not allow in the [[Index of philosophy]].
#
# Takes one argument, a reference to the blacklist hash, and adds to it:
#   - every title found as the first [[...]] link on a line of the local file
#     "User:Philosobot/Blacklist.wiki" (first character upper-cased),
#   - every non-blank line of the local file "All_philosophers.txt".
# The hash value is a short wiki-text note saying where the entry came from.
# A file that cannot be read is reported with a warning and contributes nothing.
sub put_philosophers_on_blacklist_and_user_selected_also {
  my $blacklist=shift;

  # read blacklist from file
  foreach my $line (_read_blacklist_lines("User:Philosobot/Blacklist.wiki")) {
    next unless ($line =~ /\[\[(.*?)\]\]/);
    my $title = $1;
    $title =~ s/^(.)/uc($1)/e; # upcase the first character
    $blacklist->{$title}= '(is in [[User:Philosobot/Blacklist]])';
  }

  # blacklist the philosophers (which already are in the [[Index of philosophers]])
  foreach my $line (_read_blacklist_lines("All_philosophers.txt")) {
    next if ($line =~ /^\s*$/); # a blank line is not an article title
    $blacklist->{$line}= '(is in the [[Index of philosophers]])';
  }
}

# Return the lines of the file at $path (without the newlines), or an empty
# list, after a warning, when the file cannot be opened.
sub _read_blacklist_lines {
  my ($path) = @_;

  my $fh;
  unless (open($fh, '<', $path)) {
    warn "Cannot read $path: $!\n";
    return ();
  }

  local $/; # slurp the whole file, whatever the global record separator is
  my $content = <$fh>;
  close($fh);

  return () unless defined $content;
  return split("\n", $content);
}

# The heart of the code.
#
# Arguments: the current letter ("0-9" or a single letter), the wiki text of
# that letter's list, a reference to the array of articles found in the
# categories, a reference to the blacklist hash, and a reference to a hash
# in which every accepted article title is recorded (exported to the caller).
#
# Returns the regenerated wiki text for that letter's list.
sub merge_new_entries_from_categories{

  my ($letter, $text, $articles_from_cats, $blacklist, $all_articles)=@_;
  my %entry_for;

  $text = &rm_extra_html($text); # replace &amp; with &, etc. This was needed for one run only I think.

  # One candidate per line of the existing list: the target of the link the
  # line starts with, or an empty string when the line does not start with one.
  my @candidates = map { /^\[\[(.*?)(\||\]\])/ ? $1 : "" } split("\n", $text);

  # append the randomized articles from the categories
  push @candidates, @$articles_from_cats;

  # Only entries starting with the current letter are kept.
  my $starts_with_letter = ($letter eq "0-9") ? qr/^[0-9]/ : qr/^$letter/i;

  foreach my $title (@candidates){

    next if (exists $blacklist->{$title});  # blacklisted items never enter the list of topics
    next if ($title =~ /(talk|wikipedia|template|category|user):/i);  # no talk pages, templates, etc
    next if ($title =~ /Index of philosophy \(/i); # no links to the lists themselves
    next if ($title =~ /^\s*$/); # no empty links

    # A copy of the title stripped of accents and non-alphanumeric
    # characters serves as the sort key.
    my $sort_key = &strip_accents_and_stuff ($title);

    next unless ($sort_key =~ $starts_with_letter);

    $entry_for{$sort_key} = "\[\[$title\]\] \[\[Talk:$title\| \]\] -- ";
    $all_articles->{$title}=1; # this will be exported out of this function
  }

  # split into sections (the helper receives a reference to the hash), then
  # assemble the page with the entries ordered by sort key
  &split_into_sections (\%entry_for);

  my $page = "__NOTOC__\n{{PhilTopicTOC}}\n";
  $page .= join("", map { $entry_for{$_} . "\n" } sort keys %entry_for);
  $page .= "\n[[Category:Philosophy-related lists|Philosophy $letter]]\n[[Category:Indexes of articles|Philosophy $letter]]\n";

  return $page;
}

# Post the list of newly detected categories to
# "User:Philosobot/New_phil_categories.wiki".
#
# Arguments are four array references: the known philosophy categories, the
# known philosopher categories, the other known categories, and the categories
# newly detected in this run. The lines of the local file
# "New_philosopher_categories.txt" are appended to the last array (the caller's
# array is modified). Entries that are already known, that do not contain
# "Category:", or that were already listed are left out of the posted text.
sub post_newly_detected_categories {

  my ($philosophy_categories, $philosopher_categories, $other_categories, $new_categories)=@_;

  # add to the newly discovered philosophy categories the philosopher categories discovered when running that script
  my $philosopher_cat_list = "New_philosopher_categories.txt";
  if (open(my $fh, "<", $philosopher_cat_list)) {
    local $/; # slurp the whole file, whatever the global record separator is
    my $content = <$fh>;
    close($fh);
    push @$new_categories, split("\n", $content) if defined $content;
  } else {
    # Not fatal: the categories found in this run are still worth posting.
    warn "Cannot read $philosopher_cat_list: $!\n";
  }

  # current categories
  my %seen_categories;
  $seen_categories{$_} = 1
    foreach (@$philosophy_categories, @$philosopher_categories, @$other_categories);

  # see which of the @$new_categories are truly new
  my $text = "";
  foreach my $category (@$new_categories){
    next if ( exists $seen_categories{$category} );
    next unless ($category =~ /Category:/);
    $seen_categories{$category} = 1; # list each new category only once
    $text = $text . "\[\[:$category\]\] -- \n";
  }

  my $file         = "User:Philosobot/New_phil_categories.wiki";
  my $Editor       = wikipedia_login();
  my $sleep        = 5;
  my $attempts     = 500;
  my $edit_summary = "Today's new philosophy categories.";
  wikipedia_submit($Editor, $file, $edit_summary, $text, $attempts, $sleep);
}

# Merge today's log into the log kept on disk, submit the result to Wikipedia
# and write it back to disk.
#
# Arguments: today's log (wiki text) and the name of the log file, which is
# used both as the local file name and as the page passed to wikipedia_submit.
#
# Today's log is inserted right before the first line starting with "==" that
# follows a newline. When the stored log cannot be read or contains no such
# line, today's log is appended instead of being dropped.
sub merge_logs_and_submit{

  my ($todays_log, $log_file)=@_;

  # Read in the log from previous days (from the disk)
  my $combined_log = '';
  if (open(my $in, '<', $log_file)) {
    local $/; # slurp the whole file, whatever the global record separator is
    $combined_log = <$in>;
    $combined_log = '' unless defined $combined_log;
    close($in);
  } else {
    warn "Cannot read $log_file: $!\n";
  }

  # append to it today's log
  unless ($combined_log =~ s/(^.*?\n)(==.*?)$/$1$todays_log\n$2/sg) {
    # no section heading to insert before: keep what is there and add today's log at the end
    $combined_log .= "\n" if ($combined_log ne '' && $combined_log !~ /\n$/);
    $combined_log .= $todays_log . "\n";
  }

  # keep only the last month or so: the text before the first "\n==" plus
  # the 38 chunks that follow it
  my @days = split ("\n==", $combined_log);
  splice (@days, 39) if (@days > 39);
  $combined_log = join ("\n==", @days);

  # submit the log file, and write the logfile back to disk (away from wikipedia vandals)
  my $Editor       = wikipedia_login();
  my $sleep        = 5;
  my $attempts     = 500;
  my $edit_summary = "Today's changes to the [[Index of philosophy]].";
  wikipedia_submit($Editor, $log_file, $edit_summary, $combined_log, $attempts, $sleep);

  # write new log to disk
  if (open(my $out, '>', $log_file)) {
    print {$out} "$combined_log\n";
    close($out) or warn "Cannot finish writing $log_file: $!\n"; # buffered write errors surface here
  } else {
    warn "Cannot write $log_file: $!\n";
  }
}

Category:Tools by Gregbard