#!/usr/local/bin/perl -w
#################################################################
# Program: checkwiki.pl
# Description: Scan all pages of a Wikipedia project (dump or live) for errors
# Author: Stefan Kühn
# Licence: GPL
#################################################################
#################################################################
# Syntax
# perl checkwiki.pl -p enwiki -m live
#################################################################
# New features, last changes and discussion
# http://de.wikipedia.org/wiki/Benutzer:Stefan_Kühn/Check_Wikipedia
#################################################################
# Error exception
$SIG{__DIE__} = \&die_error;   # route fatal errors through die_error() (defined elsewhere in this file)
$SIG{__WARN__} = \&warn_error; # route warnings through warn_error()
use strict;
use warnings;
our $VERSION = '2013-02-15';   # release date of this script, shown in the start banner
#################################################################
# notice
# delete_old_errors_in_db --> Problem with deleting of errors in loadmodus
# delete_deleted_article_from_db --> Problem old articles
#################################################################
# Load Modules
#################################################################
use DBI;
use Getopt::Long qw(:config bundling no_auto_abbrev no_ignore_case);
use LWP::UserAgent;
use URI::Escape;
#################################################################
# declare_global_directorys
#################################################################
our $dump_directory = '/mnt/user-store/dumps/store/'; # toolserver
our $dump_directory2 = '/mnt/user-store/dumps/tmp/'; # second dump location (temporary dump files)
# our $dump_directory = '../../dump/'; # home or usb
our $output_directory = '/mnt/user-store/sk/data/checkwiki/'; # result files, one subdirectory per project
our $input_directory_new = '/mnt/user-store/sk/data/new_article/'; # lists of newly created articles
our $input_directory_change = '/mnt/user-store/sk/data/last_changes/'; # lists of recently changed articles
our $output_templatetiger = '/mnt/user-store/sk/data/templatetiger/'; # template parameter export
our $output_geo = '/mnt/user-store/sk/data/geo/'; # coordinate export
#our $dump_filename = '/mnt/user-store/dump/dewiki-20080607-pages-articles.xml'; #'Wikipedia-20080502083556.xml';
our $dump_filename = ''; # filled in later by search_for_last_dump()
#$dump_filename ='../../dump/dewiki-20071217-pages-articles.xml';
#################################################################
# Declaration of variables (global)
#################################################################
our $quit_program = 'no'; # quit the program (yes/no); set to 'yes' to stop the program in an emergency
our $quit_reason = ''; # collected messages explaining why the program quit (printed at the end)
our $test_programm = 'true'; # only for program tests
our $dump_or_live = ''; # scan modus (dump, live, only)
our $silent_modus = 0; # silent modus (very low output at screen) for batch
our $test_modus = 0; # test modus (appends '_test' to the project name, see check_input_arguments)
our $starter_modus = 0; # to update in the loadmodus the cw_starter table
our $load_modus_done = 1; # load 'done' articles from db
our $load_modus_new = 1; # load new articles from db
our $load_modus_dump = 1; # load dump articles from db
our $load_modus_last_change = 1; # load last_change articles from db
our $load_modus_old = 1; # load old articles from db
our $details_for_page = 'no'; # yes/no - during the scan you can get more details for an article scan
our $time_start = time(); # start time in seconds
our $time_end = time(); # end time in seconds
our $date = 0; # date of dump "20060324"
our $line_number = 0; # number of line in dump
our $project; # name of the project 'dewiki'
our $language = ''; # language of dump 'de', 'en'
our $page_number = 0; # number of pages in namespace 0
our $base = ''; # base of article, 'http://de.wikipedia.org/wiki/Hauptseite'
our $home = ''; # base of article, 'http://de.wikipedia.org/wiki/'
our @namespace; # namespace values
# 0 number
# 1 namespace in project language
# 2 namespace in english language
our $namespaces_count = -1; # number of namespaces
our @namespacealiases; # namespacealiases values
# 0 number
# 1 namespacealias
our $namespacealiases_count= -1; # number of namespacealiases
our @namespace_cat; # all namespaces for categories
our @namespace_image; # all namespaces for images
our @namespace_templates; # all namespaces for templates
# Magic word aliases (image attribute keywords etc.);
# presumably filled from the project metadata -- TODO confirm against load_metadata_from_file().
our @magicword_defaultsort;
our @magicword_img_thumbnail;
our @magicword_img_manualthumb;
our @magicword_img_right;
our @magicword_img_left;
our @magicword_img_none;
our @magicword_img_center;
our @magicword_img_framed;
our @magicword_img_frameless;
our @magicword_img_page;
our @magicword_img_upright;
our @magicword_img_border;
our @magicword_img_sub;
our @magicword_img_super;
our @magicword_img_link;
our @magicword_img_alt;
our @magicword_img_width;
our @magicword_img_baseline;
our @magicword_img_top;
our @magicword_img_text_top;
our @magicword_img_middle;
our @magicword_img_bottom;
our @magicword_img_text_bottom;
# Wiki-special variables
our @live_article; # to-do-list for live (all articles to scan)
our $current_live_article = -1; # line_number_of_current_live_article
our $number_of_live_tests = -1; # Number of articles for live test
our $current_live_error_scan = -1; # for scan every 100 article of an error
our @live_to_scan ; # article of one error number which should be scanned
our $number_article_live_to_scan = -1; # all article from one error
our @article_was_scanned; # if an article was scanned, it will be inserted here
our $xml_text_from_api = ''; # the text of more than one article from the API
our $error_counter = -1; # number of found errors in all articles
our @error_description; # Error Description
# 0 priority in script
# 1 title in English
# 2 description in English
# 3 number of found (only live scanned)
# 4 priority of foreign language
# 5 title in foreign language
# 6 description in foreign language
# 7 number of found in last scan (from statistic file)
# 8 all known errors (from statistic file + live)
# 9 XHTML translation title
# 10 XHTML translation description
our $number_of_error_description = -1; # number of error_description
our $max_error_count = 50; # maximum of shown articles per error
our $maximum_current_error_scan = -1; # how much should be scanned to reach the max_error_count
our $rest_of_errors_not_scan_yet = '';
our $number_of_all_errors_in_all_articles = 0; # all errors
our $for_statistic_new_article = 0;
our $for_statistic_last_change_article = 0;
our $for_statistic_geo_article = 0;
our $for_statistic_number_of_articles_with_error = 0;
###########################
# files (each is prefixed with the project name at use time)
###########################
our $live_filename = 'input_for_live.txt';
our $output_live_wiki = 'output_for_wikipedia.txt';
our $output_dump_wiki = 'output_for_wikipedia_dump.txt';
our $error_list_filename = 'error_list.txt';
our $error_list_filename_only = 'error_list_only.txt';
our $error_list_filename_dump = 'error_list_dump.txt'; # all errors from the last dump scan
our $error_list_filename_backup = 'error_list_dump_backup.txt';
our $error_statistic_filename = 'error_statistic.txt';
our $error_statistic_filename_only = 'error_statistic_only.txt';
our $error_statistic_filename_list = 'error_statistic_list.txt';
our $translation_file = 'translation.txt';
our $error_list_filename_30 = 'error_list_error_030.txt';
our $error_list_filename_every = 'error_list_error'; # for all errors
our $error_geo_list_filename = 'error_geo_list.txt';
our $error_geo_list_filename_only = 'error_geo_list_only.txt';
our $error_geo_list_filename_html = 'error_geo_list.htm';
our $error_geo_list_filename_only_html = 'error_geo_list_only.htm';
our $log_file = 'log.txt';
our $templatetiger_filename = '';
# Interwiki language codes; presumably the language versions handled by
# this script -- TODO confirm where @inter_list is consumed.
our @inter_list = ( 'af', 'als', 'an', 'ar',
'bg', 'bs',
'ca', 'cs', 'cy',
'da', 'de',
'el', 'en', 'eo', 'es', 'et', 'eu',
'fa', 'fi', 'fr', 'fy',
'gl', 'gv',
'he', 'hi', 'hr', 'hu',
'id', 'is', 'it',
'ja', 'jv',
'ka', 'ko',
'la', 'lb', 'lt',
'ms',
'nds', 'nds_nl', 'nl', 'nn', 'no',
'pl', 'pt',
'ro', 'ru',
'sh', 'simple', 'sk', 'sl', 'sr', 'sv', 'sw',
'ta', 'th', 'tr',
'uk', 'ur',
'vi', 'vo',
'yi',
'zh'
);
# Interwiki prefixes (long and short form) of Wikimedia sister projects.
our @foundation_projects = ( 'wikibooks', 'b',
'wiktionary', 'wikt',
'wikinews', 'n',
'wikiquote', 'q',
'wikisource', 's',
'wikipedia', 'w',
'wikispecies', 'species',
'wikimedia', 'foundation', 'wmf',
'wikiversity', 'v',
'commons',
'meta', 'metawikipedia', 'm',
'incubator',
'mw',
'quality',
'bugzilla', 'mediazilla',
'nost',
'testwiki'
);
# current time (German names: Sekunden=seconds, Minuten=minutes, Stunden=hours,
# Monatstag=day of month, Monat=month, Jahr=year, Wochentag=weekday,
# Jahrestag=day of year, Sommerzeit=daylight saving time)
our ($akSekunden, $akMinuten, $akStunden, $akMonatstag, $akMonat,
$akJahr, $akWochentag, $akJahrestag, $akSommerzeit) = localtime(time);
our $CTIME_String = localtime(time);
# localtime() gives a 0-based month and years since 1900
$akMonat = $akMonat + 1;
$akJahr = $akJahr + 1900;
# zero-pad each component to two digits for display
$akMonat = "0".$akMonat if ($akMonat<10);
$akMonatstag = "0".$akMonatstag if ($akMonatstag<10);
$akStunden = "0".$akStunden if ($akStunden<10);
$akMinuten = "0".$akMinuten if ($akMinuten<10);
our $translation_page = ''; # name of the page with translation for example in de: "Wikipedia:WikiProject Check Wikipedia/Übersetzung"
# Boilerplate wiki text for the project page header (English default).
our $start_text = '';
$start_text = $start_text ."The WikiProject '''Check Wikipedia''' will help to clean up the syntax of Wikipedia and to find some other errors.\n";
$start_text = $start_text ."\n";
$start_text = $start_text ."'''Betatest''' - At the moment the script has some bugs and not every error on this page is an actual error. \n";
$start_text = $start_text ."\n";
# Project description in English, inserted into the output wiki page.
our $description_text = '';
$description_text = $description_text ."== Project description in English == \n";
$description_text = $description_text ."* '''What is the goal of this project?'''\n";
$description_text = $description_text ."** This project should help to clean up the data of all articles in many different languages.\n";
$description_text = $description_text ."** If we have a clear and clean syntax in all articles more projects (for example: Wikipedia-DVD) can use our data more easily.\n";
$description_text = $description_text ."** The project was inspired by [[:en:Wikipedia:WikiProject Wiki Syntax]].\n";
$description_text = $description_text ."** In order to use the data of a Wikipedia project without the Mediawiki software you need to write a parser. If many articles include wrong syntax it is difficult to program the parser since it needs to be complex enough to recognize the syntax errors.\n";
$description_text = $description_text ."** This project helps to find many errors in all kinds of language and will support many languages in the future. \n";
$description_text = $description_text ."\n";
$description_text = $description_text ."* '''How does it work?'''\n";
$description_text = $description_text ."** The script scans every new [http://dumps.wikimedia.org dump] and creates a list of articles with errors.\n";
$description_text = $description_text ."** The script scans all articles on the list on a daily basis to create a new list for users, omitting already-corrected articles.\n";
$description_text = $description_text ."** The script is written in Perl by: [[:de:User:Stefan Kühn|Stefan Kühn]] "."\n";
$description_text = $description_text ."** You can download the script [http://toolserver.org/~sk/checkwiki/checkwiki.pl here]. It is licensed under GPL."."\n";
$description_text = $description_text ."** [[:de:User:Stefan Kühn/Check Wikipedia|New features, last changes and discussion]]. "."\n";
$description_text = $description_text ."\n";
$description_text = $description_text ."* '''What can you do?'''\n";
$description_text = $description_text ."** The script creates a new error page at the toolserver every day. Please copy and paste the daily updated page at the toolserver (See downloads) to this page here. Attention: That page is a UTF-8 document. In case your browser cannot display the file in UTF-8 you can copy it into a text editor (for example: Notepad++) and convert it to UTF-8. \n";
$description_text = $description_text ."** You can fix an error in one or more articles. \n";
$description_text = $description_text ."** You can delete all fixed articles from this list. \n";
$description_text = $description_text ."** If all articles in one category have been fixed you can delete this category. \n";
$description_text = $description_text ."** You can suggest a new category of errors to the author of the script. \n";
$description_text = $description_text ."** You can also inform the author if you want this project to be implemented into your language's Wikipedia. \n";
$description_text = $description_text ."\n";
$description_text = $description_text ."* '''Please don't… '''\n";
$description_text = $description_text ."** insert an article by hand since it will disappear from the list with the next automatic update of this page. \n";
$description_text = $description_text ."** try to fix spelling mistakes within this page since all manual changes will disappear as well with the next update. Instead, send an e-mail or message to the author so he can fix the spelling in the script. \n";
$description_text = $description_text ."\n";
our $category_text = '';
# Priority section headings (English defaults, may be overridden by translation)
our $top_priority_script = 'Top priority';
our $top_priority_project = '';
our $middle_priority_script = 'Middle priority';
our $middle_priority_project = '';
our $lowest_priority_script = 'Lowest priority';
our $lowest_priority_project = '';
our $dbh; # global database handle, set in open_db()
###############################
# variables for one article
###############################
# NOTE(review): this increment runs exactly once at script startup,
# outside any loop -- looks like leftover code; verify it is intentional.
$page_number = $page_number + 1;
our $title = ''; # title of the current article
our $page_id = -1; # page id of the current article
our $revision_id = -1; # revision id of the current article
our $revision_time = -1; # revision time of the current article
our $text = ''; # text of the current article (for work)
our $text_origin = ''; # text of the current article origin (for save)
our $text_without_comments = ''; # text of the current article without_comments (for save)
our $page_namespace = -100; # namespace of page
our $page_is_redirect = 'no';
our $page_is_disambiguation = 'no';
our $page_categories = '';
our $page_interwikis = '';
our $page_has_error = 'no'; # yes/no error in this page
our $page_error_number = -1; # number of all article for this page
our @comments; # 0 pos_start
# 1 pos_end
# 2 comment
our $comment_counter = -1; # number of comments in this page
our @category; # 0 pos_start
# 1 pos_end
# 2 category Test
# 3 linkname Linkname
# 4 original [[Category:Test|Linkname]]
our $category_counter = -1;
our $category_all = ''; # all categories
our @interwiki; # 0 pos_start
# 1 pos_end
# 2 interwiki Test
# 3 linkname Linkname
# 4 original [[de:Test|Linkname]]
# 5 language
our $interwiki_counter = -1;
our @lines; # text separated in lines
our @headlines; # headlines
our @section; # text between headlines
undef(@section);
our @lines_first_blank; # all lines where the first character is ' '
our @templates_all; # all templates
our @template; # templates with values
# 0 number of template
# 1 templatename
# 2 template_row
# 3 attribut
# 4 value
our $number_of_template_parts = -1; # number of all template parts
our @links_all; # all links
our @images_all; # all images
our @isbn; # all isbn of books
our @ref; # all ref
our $page_has_geo_error = 'no'; # yes/no geo error in this page
our $page_geo_error_number = -1; # number of all article for this page
our $end_of_dump = 'no'; # when last article from dump scan then 'yes', else 'no'
our $end_of_live = 'no'; # when last article from live scan then 'yes', else 'no'
our $statistic_online_page = -1; # number of pages online from metadata-statistic
# Main program flow: almost every step is skipped once $quit_program
# has been set to 'yes' by an earlier step.
check_input_arguments();
open_db();
open_file() if ($quit_program eq 'no'); # logfile, dumpfile, metadata (API, File)
get_error_description() if ($quit_program eq 'no'); # all error descriptions from this script
load_text_translation() if ($quit_program eq 'no'); # load translation from wikipage
output_errors_desc_in_db() if ($quit_program eq 'no'); # update the database with newest error description
output_text_translation_wiki() if ($quit_program eq 'no'); # output the new wikipage for translation
load_article_for_live_scan() if ($quit_program eq 'no'); # only for live
scan_pages() if ($quit_program eq 'no'); # scan all articles
close_file(); # close dump or templatetiger-file
update_table_cw_error_from_dump() if ($quit_program eq 'no');
delete_deleted_article_from_db() if ($quit_program eq 'no');
delete_article_from_table_cw_new() if ($quit_program eq 'no');
delete_article_from_table_cw_change() if ($quit_program eq 'no');
update_table_cw_starter();
#output_errors() if ($quit_program eq 'no');
output_little_statistic() if ($quit_program eq 'no'); # print counter of found errors
output_duration() if ($quit_program eq 'no'); # print time at the end
print $quit_reason if ($quit_reason ne '');
close_db();
close_logfile();
print 'finish'."\n";
#################################################################
#################################################################
#################################################################
#################################################################
#################################################################
sub get_time_string{
    # Return the current local time as a compact, sortable string in the
    # format 'YYYYMMDD HHMMSS' (e.g. '20130215 093005').
    # No parameters; always uses the current time.
    my ($sec, $min, $hour, $mday, $mon, $year) = localtime (time);
    # localtime() yields a 0-based month and years since 1900;
    # sprintf replaces the nine manual zero-padding statements.
    my $result = sprintf ('%04d%02d%02d %02d%02d%02d',
                          $year + 1900, $mon + 1, $mday, $hour, $min, $sec);
    return ($result);
}
sub check_input_arguments {
# Parse and validate the command line (-m mode, -p project, --load,
# --silent, --starter, --test) via Getopt::Long.  On invalid input it
# sets $quit_program = 'yes' and appends usage text to $quit_reason;
# otherwise it derives $language from the project name and prints the
# start banner.
my $load_mode;
if (!GetOptions ('load=s' => \$load_mode,
'm=s' => \$dump_or_live,
'p=s' => \$project,
'silent' => \$silent_modus,
'starter' => \$starter_modus,
'test' => \$test_modus)) {
return;
}
# Check argument value for scan mode.
if ($dump_or_live ne 'dump' &&
$dump_or_live ne 'only' &&
$dump_or_live ne 'live') {
$quit_reason .= "Mode unknown, for example: \"-m dump/live/only\".\n\n";
$quit_program = 'yes';
}
# Check that a project name is given.
if (!defined ($project)) {
$quit_reason .= "No project name, for example: \"-p dewiki\".\n\n";
$quit_program = 'yes';
}
# Split load mode: a slash-separated list such as "new/done/old"
# enables exactly the listed article sources (live mode only).
if (defined ($load_mode) && $dump_or_live eq 'live') {
my %LoadOptions = map { $_ => 1; } split (/\//, $load_mode);
$load_modus_done = exists ($LoadOptions {'done'}); # done article from db
$load_modus_new = exists ($LoadOptions {'new'}); # new article from db
$load_modus_dump = exists ($LoadOptions {'dump'}); # dump article from db
$load_modus_last_change = exists ($LoadOptions {'last_change'}); # last_change article from db
$load_modus_old = exists ($LoadOptions {'old'}); # old article from db
}
if ($quit_program eq 'yes') {
# End of script, because no correct parameter; show usage examples.
$quit_reason .= "Use for scan a dump\n";
$quit_reason .= "perl checkwiki.pl -p dewiki -m dump\n";
$quit_reason .= "perl checkwiki.pl -p nds_nlwiki -m dump\n";
$quit_reason .= "perl checkwiki.pl -p nds_nlwiki -m dump --silent\n";
$quit_reason .= "perl checkwiki.pl -p nds_nlwiki -m dump --silent --test\n\n";
$quit_reason .= "Use for scan a list of pages live\n";
$quit_reason .= "perl checkwiki.pl -p dewiki -m live\n";
$quit_reason .= "perl checkwiki.pl -p dewiki -m live --silent\n";
$quit_reason .= "perl checkwiki.pl -p dewiki -m live --silent --test\n";
$quit_reason .= "perl checkwiki.pl -p dewiki -m live --load new/done/dump/last_change/old\n";
$quit_reason .= "\n";
} else {
# Derive the language code by stripping the project suffix,
# e.g. 'dewiki' -> 'de', 'dewikisource' -> 'de'.
$language = $project;
$language =~ s/source$//;
$language =~ s/wiki$//;
print "\n";
if (!$silent_modus) {
print "#########################################################\n";
print '######## checkwiki.pl - Version '.$VERSION.' ########'."\n";
print "#########################################################\n";
}
two_column_display('start:', $akJahr.'-'.$akMonat.'-'.$akMonatstag.' '.$akStunden.':'.$akMinuten);
two_column_display('project:', $project);
if (!$silent_modus) {
my $modus_output = '';
$modus_output = 'scan a dump' if ($dump_or_live eq 'dump');
$modus_output = 'scan live' if ($dump_or_live eq 'live');
$modus_output = 'scan a dump only some errors' if ($dump_or_live eq 'only');
two_column_display ('Modus:', $dump_or_live. ' ('.$modus_output.')');
}
if ($test_modus) { #modus only for test
$project .= '_test';
two_column_display ('Test-Modus:', $project.'!!!');
}
}
}
sub open_db{
    #################################################################
    # DB
    #################################################################
    # Open the MySQL connection (schema u_sk_yarrow) and store the
    # handle in the global $dbh.  The password is read from a local
    # file containing a line of the form "pass=SECRET".
    # NOTE(review): the original password-file path was lost to file
    # corruption in this source -- TODO confirm the correct location
    # before deploying.
    my $password_filename = $ENV{'HOME'}.'/.checkwiki_password';
    my $password = '';
    open (my $pwd_fh, '<', $password_filename)
        or die "Could not open password file '".$password_filename."': $!";
    while (my $pwd_line = <$pwd_fh>) {
        if ($pwd_line =~ /^pass=/ ) {
            $password = $pwd_line;
            $password =~ s/^pass=//;
            $password =~ s/\n//g;
        }
    }
    close ($pwd_fh);
    my $hostname = `hostname`; # get name of host (PC-Name)
    chomp ($hostname);
    two_column_display ('host:', $hostname);
    # Choose the DSN: a local database on the developer PC,
    # otherwise the Toolserver SQL host.  (The two previous connect
    # branches were identical except for the DSN, so they are merged.)
    my $dsn = ($hostname =~ 'kunopc')
              ? 'DBI:mysql:u_sk_yarrow'            # local
              : 'DBI:mysql:u_sk_yarrow:host=sql';  # Toolserver
    $dbh = DBI->connect( $dsn,
                         'sk',
                         $password ,
                         {
                             RaiseError => 1,
                             AutoCommit => 1
                         }
                       ) or die "Database connection not made: $DBI::errstr" . DBI->errstr;
    # Do not keep the password in memory longer than necessary.
    $password = '';
}
sub close_db{
# close database: disconnect the global handle opened in open_db()
$dbh->disconnect();
}
sub close_logfile{
# close logfile; the starter modus never opened one (see open_file)
close (LOGFILE) if (!$starter_modus);
}
###################################################################################
sub get_error_description{
    # Load the description of every error type known to this script
    # (via error_list) and reset all run-dependent columns of
    # @error_description to their base values.
    print_line();
    two_column_display ('load:', 'all error description from script');
    error_list('get_description');
    # Count the defined error descriptions; numbering starts at 1, and
    # the count ends on the first index without a title.
    my $count = 1;
    $count++ while (defined ($error_description[$count][1]));
    $number_of_error_description = $count;
    # Reset the per-run columns of every description to a basic level.
    foreach my $index (1 .. $number_of_error_description) {
        $error_description[$index][3]  = 0;  # number found (live scan)
        $error_description[$index][4]  = -1; # foreign-language priority
        $error_description[$index][5]  = ''; # foreign-language title
        $error_description[$index][6]  = ''; # foreign-language description
        $error_description[$index][7]  = 0;  # number found in last scan
        $error_description[$index][8]  = 0;  # all known errors
        $error_description[$index][9]  = ''; # XHTML translation title
        $error_description[$index][10] = ''; # XHTML translation description
    }
    two_column_display ('error description:', ($number_of_error_description - 1).' in script');
}
###################################################################################
sub open_file{
    # Prepare the input and output files for this run:
    # - create the per-project output directory and open the logfile
    # - in dump mode: find the newest dump, remember its name and date
    # - open the dump as a bzip2 pipe plus the Templatetiger/geo exports
    # On a fatal problem (e.g. missing dump) $quit_program is set to 'yes'.
    # create subdirectory
    if (not (-e $output_directory.$project )) {
        print 'create directory:'."\t". $output_directory.$project."\n";
        system ('mkdir -p '.$output_directory.$project);
    }
    ################################
    # open logfile (the starter modus writes no log, see close_logfile)
    my $log_filename = $output_directory.$project.'/'.$project.'_'.$log_file;
    open (LOGFILE, '+>'.$log_filename) if (!$starter_modus);
    ################################
    # if new dump is available
    if ($dump_or_live eq 'dump') {
        $dump_filename = search_for_last_dump();
        two_column_display ('Dump_filename:', $dump_filename) if (!$silent_modus);
        # The name of the last scanned dump is remembered in a small
        # text file so the same dump is not scanned twice.
        my $last_dump_filename = $output_directory.$project.'/'.$project.'_last_dump_name.txt';
        two_column_display ('last_dump_filename:', $output_directory.$project.'/');
        two_column_display ('', $project.'_last_dump_name.txt');
        if (not (-e $last_dump_filename)) {
            # create the file if it does not exist ('x' = no dump scanned yet)
            system ('touch '.$last_dump_filename);
            two_column_display ('create last_dump_file:', $project.'_last_dump_name.txt');
            open (my $first_name_fh, '+>', $last_dump_filename);
            print {$first_name_fh} 'x';
            close ($first_name_fh);
        }
        # read the name of the last scanned dump
        # (this readline was destroyed by file corruption; restored here)
        open (my $old_name_fh, '<', $last_dump_filename);
        my $last_dump_name_old = <$old_name_fh>;
        $last_dump_name_old = '' if (not defined ($last_dump_name_old));
        chomp ($last_dump_name_old);
        close ($old_name_fh);
        # get the date from the dump filename, e.g. '...-20130215-...' -> '2013-02-15'
        our $dump_date_for_output = $dump_filename;
        $dump_date_for_output =~ s/^[^\-]-//g;
        $dump_date_for_output =~ s/^[^0-9]+//g;
        $dump_date_for_output =~ s/[^0-9]+$//g;
        if (length($dump_date_for_output) >=8){
            $dump_date_for_output = substr($dump_date_for_output,0,4).'-'.substr($dump_date_for_output,4,2).'-'.substr($dump_date_for_output,6,2);
        }
        if ($dump_filename ne $last_dump_name_old ) {
            # a new dump was found -> remember its name for the next run
            two_column_display ('Last scanned dump:', $last_dump_name_old);
            two_column_display ('Current found dump:', $dump_filename);
            open (my $new_name_fh, '>', $last_dump_filename);
            print {$new_name_fh} $dump_filename;
            close ($new_name_fh);
        }
        # update last_dump time for project in database
        my $sql_text = "update /* SLOW_OK */ cw_project set last_dump ='".$dump_date_for_output."' where project = '". $project ."';";
        my $sth = $dbh->prepare( $sql_text );
        $sth->execute;
        # delete old list of articles from last dumpscan in table cw_dumpscan
        my $sql_text2 = "delete /* SLOW_OK */ from cw_dumpscan where project = '". $project ."';";
        $sth = $dbh->prepare( $sql_text2 );
        $sth->execute;
    }
    ################################
    if ($dump_or_live eq 'dump' or $dump_or_live eq 'only') {
        # check that the dump file exists: first ../store, then ../tmp
        my $full_dump_path_filename = $dump_directory.$project.'/'.$dump_filename;
        if (not -e $full_dump_path_filename) {
            $full_dump_path_filename = $dump_directory2.'/'.$dump_filename;
        }
        if ($dump_filename ne '' and -e $full_dump_path_filename ) {
            # open the dump as a decompressing pipe; DUMP is a global
            # handle read by the scan routines elsewhere in this file
            open(DUMP, "bzip2 -d -q <$full_dump_path_filename |");
        } else {
            $quit_program = 'yes';
            $quit_reason = $quit_reason. "file '$full_dump_path_filename'". " don't exist!\n";
        }
        #################
        # Templatetiger
        #################
        $templatetiger_filename = $output_templatetiger.$project.'/'.$project.'_templatetiger.txt';
        if (not (-e $output_templatetiger.$project )) {
            two_column_display ('create new subdirectory', 'templatetiger');
            system ('mkdir -p '.$output_templatetiger.$project);
        }
        if (-e $templatetiger_filename ) {
            two_column_display ('delete old TT-file:', $project.'_templatetiger.txt');
            system ('rm -f '.$templatetiger_filename) ;
        }
        # TEMPLATETIGER is a global handle written elsewhere in this file
        open (TEMPLATETIGER, '>>'.$templatetiger_filename);
        #################
        # GEO Export
        #################
        our $geo_export_filename = $output_geo.$project.'/'.$project.'_coordinates.txt';
        if (not (-e $output_geo.$project )) {
            two_column_display ('create new subdirectory', 'geo');
            system ('mkdir -p '.$output_geo.$project);
        }
        if (-e $geo_export_filename ) {
            two_column_display ('Delete old Geo-file', $geo_export_filename);
            system ('rm -f '.$geo_export_filename) ;
        }
    }
    # load the project metadata (namespaces, magic words, ...)
    if ($quit_program eq 'no' ) {
        read_and_write_metadata_from_url();
        load_metadata_from_file();
    }
}
sub search_for_last_dump {
    # Search both dump directories for the newest
    # '*-pages-articles.xml.bz2' file of the current project.
    # Returns the bare filename, or '' if nothing was found (and, for a
    # dump scan, flags the program to quit).
    my $newest_file = '';
    print_line();
    two_column_display ('search dump in:', $dump_directory);
    two_column_display ('search dump in:', $dump_directory2);
    my @candidates = (
        glob ($dump_directory .$project.'/*-pages-articles.xml.bz2'),  # ../store
        glob ($dump_directory2.$project.'*-pages-articles.xml.bz2'),   # ../tmp
    );
    my $candidate_count = scalar (@candidates);
    # Accept both 'foo_test' and the plain 'foo' dump for a test project.
    my $project_plain = $project;
    $project_plain =~ s/_test$//;
    foreach my $candidate (@candidates) {
        my $size_in_bytes = -s $candidate;   # taken before the path is stripped
        $candidate =~ s/(.)+\///g;           # keep only the bare filename
        # only dumps of this project, and only files with more than 0 bytes
        if (( index ($candidate, $project.'-') == 0
              or index ($candidate, $project_plain.'-') == 0 )
            and $size_in_bytes > 0 ) {
            # glob() returns sorted names, so the last match is the newest dump
            if ($candidate =~ /^$project(_test)?-[0-9]/) {
                $newest_file = $candidate;
            }
        }
    }
    if ($newest_file eq '' and $dump_or_live ne 'live') { # stop if dump scan , run if the program will scan live
        # No file found
        $quit_program = 'yes';
        $quit_reason = $quit_reason.$candidate_count.' XML-files found in folder '.$dump_directory."\n";
        $quit_reason = $quit_reason.'Found no XML-file for project: '.$project."\n";
    }
    return ($newest_file);
}
######################################################################
sub load_article_for_live_scan{
# Build the to-do list @live_article for a live scan: collect article
# titles from several database sources (depending on the --load flags),
# then sort and merge duplicate titles so each article appears once
# with a comma-separated list of its error numbers.  If no articles are
# found, $quit_program is set to 'yes'.
if ($dump_or_live eq 'live' ) {
# open list for live
print_line();
two_column_display('Load article for:', 'live scan') if (!$silent_modus);
#print 'Data: '."\t\t".$output_directory.$project.'/'.$project.'_'.$error_list_filename ."\n";
if (not (-e $output_directory.$project.'/'.$project.'_'.$error_list_filename )){
#$quit_program = 'yes';
#$quit_reason = $quit_reason. "file:" .$output_directory.$project.'/'.$project.'_'.$error_list_filename. " don't exist!\n";
#print 'file:' .$output_directory.$project.'/'.$project.'_'.$error_list_filename. " don't exist!\n";
print 'create '.$output_directory.$project.'/'.$project.'_'.$error_list_filename. "\n";
system ('touch '.$output_directory.$project.'/'.$project.'_'.$error_list_filename);
} else {
#read articles(live)
new_article(250) if ($load_modus_new); # get 250 new article last days
last_change_article(50) if ($load_modus_last_change); # get 10 change article last days
get_done_article_from_database(250) if ($load_modus_done); # get 250 article which are set as done in the database
# which are not scan_live - NEW: with table cw_dumpscan
get_oldest_article_from_database(250) if ($load_modus_old); # get 250 article which are the date of last_scan is very old (dump_scan)
#old
#article_last_live_scan(); # get all article from last live scan, where the script found errors
# very long in many languages (maybe later)
# replace with done articles
#article_with_error_from_dump_scan(); # get all articles error from the last dump scan
# replace with article_with_error_from_dump_scan2
#article_with_error_from_dump_scan2() if ($load_modus_dump); # get 250 articles of each error from the last dump scan,
#geo_error_article(); # get all articles with geo errors last days
# sort all articles (new + live)
@live_article = sort(@live_article);
# delete all duplicate article entries; each input line has the form
# "title\terror_number", so after sorting, equal titles are adjacent
$number_of_live_tests = @live_article;
#print $number_of_live_tests."\n";
my @new_live_article;
my @split_line;
my @split_line_old;
if ($number_of_live_tests > 0) {
my $old_title = '';
my $all_errors_of_this_article = '';
my $i = -1;
foreach (@live_article) {
@split_line_old = @split_line;
@split_line = split(/\t/, $_);
my $current_title = $split_line[0];
$split_line[1] =~ s/\n//;
#print $current_title."\n";
my $number_of_split_line = @split_line;
if ($number_of_split_line != 2) {
print 'Problem with input line:'."\n";
print $_."\n";
die;
};
# a new title begins -> flush the previous title with all its errors
if ($old_title ne $current_title
and $old_title ne ''){
#save old
$i = $i+1;
$new_live_article[$i] = $old_title."\t".$all_errors_of_this_article;
$all_errors_of_this_article = '';
#print "result:".$new_live_article[$i]."\n";
}
# check new: same title as before -> append this error number
if ($old_title eq $current_title) {
#double
$all_errors_of_this_article = $all_errors_of_this_article.', '.$split_line[1];
#print 'double: '.$current_title."\t".$all_errors_of_this_article."\n";
} else {
$all_errors_of_this_article = $split_line[1];
#print 'normal: '.$current_title."\t".$all_errors_of_this_article."\n";
}
$old_title = $current_title;
}
#save last
$i = $i+1;
$new_live_article[$i] = $old_title."\t".$all_errors_of_this_article;
@live_article = @new_live_article;
$number_of_live_tests = @live_article;
}
two_column_display('all articles without double:', $number_of_live_tests);
print LOGFILE 'articles without double'."\t".$number_of_live_tests."\n" if (!$starter_modus);
@new_live_article = (); # free memory
@split_line = (); # free memory
#foreach (@live_article) {
# print LOGFILE $_."\n";
#}
#print LOGFILE 'END LIST'."\n\n";
if ($number_of_live_tests == 0) {
# if after this load in live_modus no article found, then end the scan
$quit_program = 'yes';
$quit_reason = $quit_reason. 'no article in scan list for live'."\n";
}
}
}
}
sub article_last_live_scan {
    # Load the article/error list written by the previous live scan into the
    # global @live_article and refresh the global counter $number_of_live_tests.
    # NOTE(review): the read was corrupted to "@live_article = ;" when markup
    # was stripped from this file; the <LIVE> filehandle read is restored here.
    my $file_input_live = $output_directory.$project.'/'.$project.'_'.$error_list_filename;
    open( LIVE, '<', $file_input_live )
        or die 'Cannot open '.$file_input_live.': '.$!."\n";
    @live_article = <LIVE>;
    close(LIVE);
    $number_of_live_tests = @live_article;
    two_column_display('from file articles last scan:', $number_of_live_tests);
    print LOGFILE 'articles last scan:'."\t".$number_of_live_tests."\n" if (!$starter_modus);
}
sub new_article {
    # Fetch up to $_[0] titles of newly created, not-yet-live-scanned articles
    # of the current project from cw_new and append them to the global
    # @live_article work list (tab-joined with dummy error number 0).
    # Also updates the global $for_statistic_new_article counter.
    my $new_counter = 0;
    my $limit = int( $_[0] );    # LIMIT is interpolated, so force an integer
    # Oldest unscanned articles first: the daytime of the oldest unscanned row
    # is the lower bound, then rows are taken in daytime order.
    # (The original had a comment broken across two lines here, leaving a bare
    # '\n";' statement - removed.)
    my $sql_text = 'select distinct title from cw_new'
        .' where scan_live = 0 and project = ?'
        .' and daytime >= (select daytime from cw_new where scan_live = 0 and project = ? order by daytime limit 1)'
        .' order by daytime limit '.$limit.';';
    my $sth = $dbh->prepare( $sql_text );
    $sth->execute( $project, $project );    # placeholders avoid quoting problems
    while ( my $arrayref = $sth->fetchrow_arrayref() ) {
        my $result = $arrayref->[0];    # single selected column: title
        push( @live_article, $result."\t".'0' );
        $new_counter++;
    }
    two_column_display('from db articles new:', $new_counter);
    print LOGFILE 'articles new:'."\t\t".$new_counter. "\n" if (!$starter_modus);
    $for_statistic_new_article = $new_counter;
}
sub last_change_article {
    # Fetch up to $_[0] titles of recently changed, not-yet-live-scanned
    # articles of the current project from cw_change and append them to the
    # global @live_article work list (tab-joined with dummy error number 0).
    my $change_counter = 0;
    my $limit = int( $_[0] );    # LIMIT is interpolated, so force an integer
    # Oldest unscanned changes first, same bounding scheme as new_article().
    my $sql_text = 'select distinct title from cw_change'
        .' where scan_live = 0 and project = ?'
        .' and daytime >= (select daytime from cw_change where scan_live = 0 and project = ? order by daytime limit 1)'
        .' order by daytime limit '.$limit.';';
    my $sth = $dbh->prepare( $sql_text );
    $sth->execute( $project, $project );    # placeholders avoid quoting problems
    while ( my $arrayref = $sth->fetchrow_arrayref() ) {
        my $result = $arrayref->[0];    # single selected column: title
        push( @live_article, $result."\t".'0' );
        $change_counter++;
    }
    two_column_display('from db articles changed:', $change_counter);
    print LOGFILE 'articles change:'."\t".$change_counter."\n" if (!$starter_modus);
    our $for_statistic_last_change_article = $change_counter;
}
sub geo_error_article {
    # Append the articles with geo (coordinate) errors of the last days to the
    # global @live_article list. Source: a tab-separated export file whose
    # first column is the article title. Updates $for_statistic_geo_article.
    # NOTE(review): the line read was corrupted to "my $line = ;" when markup
    # was stripped from this file; the <INPUT_GEO> read is restored here.
    my $file_geo = $project.'_'.$error_geo_list_filename;
    my $file_input_geo = $output_geo.$project.'/'.$file_geo;
    my $geo_counter = 0;
    if (-e $file_input_geo) {
        open( INPUT_GEO, '<', $file_input_geo )
            or die 'Cannot open '.$file_input_geo.': '.$!."\n";
        while ( my $line = <INPUT_GEO> ) {
            $line =~ s/\n$//g;
            my @split_line = split( /\t/, $line );
            if ( @split_line > 0 ) {    # skip empty lines
                push( @live_article, $split_line[0]."\t".'0' );
                $geo_counter++;
            }
        }
        close(INPUT_GEO);
    }
    two_column_display('from file articles geo:', $geo_counter);
    print ' (no file: '.$file_geo.' )' if not (-e $file_input_geo);
    print "\n";
    print LOGFILE 'articles geo:'."\t\t".$geo_counter."\n" if (!$starter_modus);
    $for_statistic_geo_article = $geo_counter;
}
sub article_with_error_from_dump_scan {
    # Append up to 250 titles found erroneous in the last dump scan
    # (cw_dumpscan rows not yet re-checked live) to the global @live_article
    # work list (tab-joined with dummy error number 0).
    my $database_dump_scan_counter = 0;
    my $limit = 250;    # fixed batch size per run
    my $sql_text = 'select distinct title from cw_dumpscan where scan_live = 0 and project = ? limit '.$limit.';';
    my $sth = $dbh->prepare( $sql_text );
    $sth->execute( $project );    # placeholder avoids quoting problems
    while ( my $arrayref = $sth->fetchrow_arrayref() ) {
        push( @live_article, $arrayref->[0]."\t".'0' );
        $database_dump_scan_counter++;
    }
    two_column_display('from db articles (not scan live):', $database_dump_scan_counter);
    print LOGFILE 'articles from dump (not scan live) from db:'."\t\t".$database_dump_scan_counter."\n" if (!$starter_modus);
}
sub get_done_article_from_database {
    # Append up to $_[0] titles whose errors were marked as done (ok = 1) to
    # the global @live_article list, so a re-scan can confirm the fix.
    my $database_ok_counter = 0;
    my $limit = int( $_[0] );    # LIMIT is interpolated, so force an integer
    my $sql_text = 'select title from cw_error where ok = 1 and project = ? limit '.$limit.';';
    my $sth = $dbh->prepare( $sql_text );
    $sth->execute( $project );    # placeholder avoids quoting problems
    while ( my $arrayref = $sth->fetchrow_arrayref() ) {
        push( @live_article, $arrayref->[0]."\t".'0' );
        $database_ok_counter++;
    }
    two_column_display('from db done articles:', $database_ok_counter);
    print LOGFILE 'done articles from db:'."\t\t".$database_ok_counter."\n" if (!$starter_modus);
}
sub get_oldest_article_from_database {
    # Append up to $_[0] titles whose recorded errors are older than 31 days
    # (oldest first) to the global @live_article list for re-checking.
    my $database_ok_counter = 0;
    my $limit = int( $_[0] );    # LIMIT is interpolated, so force an integer
    my $sql_text = 'select title from cw_error where project = ?'
        .' and DATEDIFF(now(),found) > 31'
        .' order by DATEDIFF(now(),found) desc limit '.$limit.';';
    my $sth = $dbh->prepare( $sql_text );
    $sth->execute( $project );    # placeholder avoids quoting problems
    while ( my $arrayref = $sth->fetchrow_arrayref() ) {
        push( @live_article, $arrayref->[0]."\t".'0' );
        $database_ok_counter++;
    }
    two_column_display('from db old articles:', $database_ok_counter);
    print LOGFILE 'old articles from db:'."\t\t".$database_ok_counter."\n" if (!$starter_modus);
}
############################################################################
sub scan_pages {
    # Main scan loop: fetch the next article (from the dump file or the live
    # API, depending on $dump_or_live) and run the error checks on it, until
    # the input source is exhausted or the live error budget is spent.
    print_line();
    print 'Start scanning'."\n" if (!$silent_modus);
    $end_of_dump = 'no';    # set to 'yes' after the last dump article
    $end_of_live = 'no';    # set to 'yes' after the last live article
    while (1) {
        set_variables_for_article();
        if ( $dump_or_live eq 'dump' or $dump_or_live eq 'only' ) {
            get_next_page_from_dump();
        }
        else {
            get_next_page_from_live();
        }
        my $source_finished = ( $end_of_dump eq 'yes' or $end_of_live eq 'yes' );
        if ($source_finished) {
            print 'articles scan finish'."\n\n" if (!$silent_modus);
        }
        elsif ( $title =~ /\.js$/ or $title =~ /\.css$/ ) {
            # scripts and stylesheets are never checked
            print 'no check in article:'."\t\t".$title."\n";
        }
        else {
            check_article();    # main check routine
        }
        last if $source_finished;
        # safety valve: stop a live scan when far too many errors piled up
        last if ( $error_counter > 40000 and $dump_or_live eq 'live' );
    }
}
# Reset all per-article global state before the next page is scanned.
# Called once per article by scan_pages(); the values are filled in again by
# get_next_page_from_dump()/get_next_page_from_live() and the check routines.
sub set_variables_for_article {
$page_number = $page_number + 1; # running count of scanned pages
$title = ''; # title of the current article
$page_id = -1; # page id of the current article
$revision_id = -1; # revision id of the current article
$revision_time = -1; # revision time of the current article
$text = ''; # text of the current article (for work)
$text_origin = ''; # text of the current article origin (for save)
$text_without_comments = ''; # text of the current article without comments (for save)
$page_namespace = -100; # namespace of page (-100 = not yet determined)
$page_is_redirect = 'no';
$page_is_disambiguation = 'no';
$page_categories = '';
$page_interwikis = '';
$page_has_error = 'no'; # yes/no error in this page
$page_error_number = -1; # presumably the count of errors in this page - TODO confirm
undef(@comments); # 0 pos_start
# 1 pos_end
# 2 comment
$comment_counter = -1; # number of comments in this page
undef(@category); # 0 pos_start
# 1 pos_end
# 2 category Test
# 3 linkname Linkname
# 4 original [[Category:Test|Linkname]]
$category_counter = -1;
$category_all = ''; # all categories
undef(@interwiki); # 0 pos_start
# 1 pos_end
# 2 interwiki Test
# 3 linkname Linkname
# 4 original [[de:Test|Linkname]]
# 5 language
$interwiki_counter = -1;
undef(@lines); # text separated in lines
undef(@headlines); # headlines
undef(@section); # text between headlines
undef(@lines_first_blank); # all lines where the first character is ' '
undef(@templates_all); # all templates
undef(@template); # templates with values
# 0 number of template
# 1 templatename
# 2 template_row
# 3 attribut
# 4 value
$number_of_template_parts = -1; # number of all template parts
undef(@links_all); # all links
undef(@images_all); # all images
undef(@isbn); # all ISBN of books
undef(@ref); # all ref
$page_has_geo_error = 'no'; # yes/no geo error in this page
$page_geo_error_number = -1; # presumably the count of geo errors in this page - TODO confirm
}
sub close_file {
    # Close the global filehandles used during a dump scan:
    # the dump input and the templatetiger export output.
    close(TEMPLATETIGER);
    close(DUMP);
}
sub update_table_cw_error_from_dump {
    # After a dump scan: replace this project's rows in cw_error with the
    # fresh results collected in cw_dumpscan, then empty cw_dumpscan for the
    # project. Does nothing outside dump mode.
    return if ( $dump_or_live ne 'dump' );
    print 'move all article from cw_dumpscan into cw_error'."\n";
    # placeholders keep the project name safely quoted
    $dbh->do( 'delete /* SLOW_OK */ from cw_error where project = ?;', undef, $project );
    $dbh->do( 'insert /* SLOW_OK */ into cw_error (select * from cw_dumpscan where project = ?);', undef, $project );
    print 'delete all article from this project in cw_dumpscan'."\n";
    $dbh->do( 'delete /* SLOW_OK */ from cw_dumpscan where project = ?;', undef, $project );
}
sub delete_deleted_article_from_db {
    # Remove fixed errors (ok = 1) of this project whose "found" timestamp is
    # not from the current month: get_time_string() yields "YYYY-MM..." and
    # the first 7 characters select year and month.
    # NOTE(review): despite the sub name, the visible SQL only deletes ok=1
    # rows from older months - verify the intent against the caller.
    my $month_pattern = '%'.substr( get_time_string(), 0, 7 ).'%';
    my $sql_text = 'delete /* SLOW_OK */ from cw_error where ok = 1 and project = ? and found not like ?;';
    my $sth = $dbh->prepare( $sql_text );
    $sth->execute( $project, $month_pattern );    # placeholders avoid quoting problems
}
sub delete_article_from_table_cw_new {
    # Clean up the cw_new queue: drop this project's rows that were already
    # live-scanned or are older than 7 days, and drop rows of ANY project
    # older than 8 days (covers projects that are never scanned).
    $dbh->do( 'delete /* SLOW_OK */ from cw_new where project = ? and (scan_live = 1 or DATEDIFF(now(),daytime) > 7);',
        undef, $project );
    $dbh->do( 'delete /* SLOW_OK */ from cw_new where DATEDIFF(now(),daytime) > 8;' );
}
sub delete_article_from_table_cw_change {
    # Clean up the cw_change queue: drop this project's rows that were already
    # live-scanned or are older than 3 days, and drop rows of ANY project
    # older than 8 days (covers projects that are never scanned).
    $dbh->do( 'delete /* SLOW_OK */ from cw_change where project = ? and (scan_live = 1 or DATEDIFF(now(),daytime) > 3);',
        undef, $project );
    $dbh->do( 'delete /* SLOW_OK */ from cw_change where DATEDIFF(now(),daytime) > 8;' );
}
sub update_table_cw_starter {
    # In starter mode: add this run's error count to the per-mode counter in
    # cw_starter, record the size of the current run, and - when "last change"
    # articles were scanned - flag the project as changed.
    # Fix: the original executed prepare('') when no load-modus flag was set;
    # the counter update is now guarded.
    return if (!$starter_modus);
    print 'update_table_cw_starter'."\n" if (!$silent_modus);
    return if ($error_counter <= 0);
    # pick the counter column for the load mode of this run
    # (last matching flag wins, as in the original if-chain)
    my $column = '';
    $column = 'errors_done'   if ($load_modus_done);
    $column = 'errors_new'    if ($load_modus_new);
    $column = 'errors_dump'   if ($load_modus_dump);
    $column = 'errors_change' if ($load_modus_last_change);
    $column = 'errors_old'    if ($load_modus_old);
    if ($column ne '') {
        # column name comes from the fixed list above, never from user input
        $dbh->do( 'update cw_starter set '.$column.' = '.$column.' + ? where project = ?;',
            undef, $error_counter, $project );
    }
    # remember how many errors the current run produced
    $dbh->do( 'update cw_starter set current_run = ? where project = ?;',
        undef, $error_counter, $project );
    if (!$load_modus_new && $load_modus_last_change) {
        # something was changed since the last run
        $dbh->do( "update cw_starter set last_run_change = 'true' where project = ?;",
            undef, $project );
    }
}
sub read_and_write_metadata_from_url {
    # Fetch the siteinfo metadata (general, namespaces, namespace aliases,
    # statistics, magic words) of the project from the MediaWiki API and store
    # the raw XML in <project>_metadata.txt for later use by dump and live
    # scans (see load_metadata_from_file).
    my $metadata = '';
    # nds_nlwiki lives under the "nds-nl" language subdomain
    $language = 'nds-nl' if ($project eq 'nds_nlwiki');
    ###########################
    # generate URL of project
    ###########################
    my $url = 'http://'.$language.'.wikipedia.org/w/api.php';
    if ($project eq 'commonswiki') {
        $url = 'http://commons.wikimedia.org/w/api.php';
    }
    if ($project =~ /source$/) {
        $url = 'http://'.$language.'.wikisource.org/w/api.php';
    }
    if ($project =~ /wiktionary$/) {
        # e.g. http://en.wiktionary.org/wiki/Main_page
        my $first_url_part = $project;
        $first_url_part =~ s/wiktionary$//;
        $url = 'http://'.$first_url_part.'.wiktionary.org/w/api.php';
    }
    if ($project =~ /wikiversity$/) {
        # e.g. http://fr.wikiversity.org/wiki/Accueil
        my $first_url_part = $project;
        $first_url_part =~ s/wikiversity$//;
        $url = 'http://'.$first_url_part.'.wikiversity.org/w/api.php';
    }
    print_line();
    two_column_display('load metadata from:', $url) ;
    $url = $url.'?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases|statistics|magicwords&format=xml';
    $metadata = raw_text2($url);
    $language = 'nds_nl' if ($project eq 'nds_nlwiki');    # restore internal name
    my $file_metadata = $output_directory.$project.'/'.$project.'_metadata.txt';
    two_column_display('save metadata into:', $output_directory) ;
    two_column_display('', $project.'_metadata.txt') ;
    # three-arg open with error check (was an unchecked two-arg open)
    open( METADATA, '>', $file_metadata )
        or die 'Cannot write '.$file_metadata.': '.$!."\n";
    print METADATA $metadata;
    close(METADATA);
}
sub load_metadata_from_file {
    # Parse the siteinfo XML stored by read_and_write_metadata_from_url() and
    # fill the global metadata structures: $base/$home, @namespace,
    # @namespace_image/@namespace_templates/@namespace_cat (plus aliases),
    # the @magicword_* lists and $statistic_online_page.
    # NOTE(review): several XML tag literals and the <METADATA> read below were
    # lost when this file was mangled (markup stripped); they have been
    # reconstructed from the MediaWiki siteinfo XML format - verify against an
    # original metadata file.
    my $file_metadata = $output_directory.$project.'/'.$project.'_metadata.txt';
    open( METADATA, '<', $file_metadata )
        or die 'Cannot open '.$file_metadata.': '.$!."\n";
    my @metadata = <METADATA>;
    close(METADATA);
    my $metatext = join( '', @metadata );
    #########################
    # sitename
    #########################
    my $pos1 = index($metatext,'sitename="') + length('sitename="');
    my $pos2 = index($metatext,'"', $pos1);
    my $sitename = substr($metatext, $pos1, $pos2 - $pos1);
    two_column_display('Sitename:', $sitename) if (!$silent_modus);
    #########################
    # base URL and wiki home
    #########################
    $base = '';
    $pos1 = index($metatext,'base="') + length('base="');
    $pos2 = index($metatext,'"', $pos1 );
    $base = substr($metatext, $pos1, $pos2 -$pos1);
    two_column_display('Base:', $base) if (!$silent_modus);
    $home = $base;
    $home =~ s/[^\/]+$//;    # strip the page name, keep ".../wiki/"
    #########################
    # namespaces: id, canonical name, local name
    # e.g.: 6  File  Bild
    #########################
    my $namespaces = '';
    $pos1 = index($metatext,'<namespaces>') + length('<namespaces>');
    $pos2 = index($metatext,'</namespaces>', $pos1);
    $namespaces = substr($metatext, $pos1, $pos2 -$pos1);
    $namespaces =~ s/<\/ns>/\n/g;
    $namespaces =~ s/\/>/>\n/g; # self-closing tag is namespace 0 - articles
    # now every namespace is on one line; drop uninteresting attributes
    $namespaces =~ s/ case="first-letter"//g;
    $namespaces =~ s/ xml:space="preserve"//g;
    $namespaces =~ s/ subpages=""//g;
    my @namespaces_split = split( /\n/, $namespaces);
    $namespaces_count = @namespaces_split;
    for (my $i = 0; $i < $namespaces_count; $i++) {
        $namespaces_split[$i] =~ s/[ ]+$//g;
        # id="..."
        my $pos1 = index($namespaces_split[$i],'id="') + length('id="');
        my $pos2 = index($namespaces_split[$i],'"', $pos1);
        my $id = substr($namespaces_split[$i], $pos1, $pos2 -$pos1);
        # canonical="..."
        $pos1 = index($namespaces_split[$i],'canonical="') + length('canonical="');
        $pos2 = index($namespaces_split[$i],'"', $pos1);
        my $canonical = substr($namespaces_split[$i], $pos1, $pos2 -$pos1);
        # local name: text after the ">" that closes the tag
        $pos1 = index($namespaces_split[$i],'>') + length('>');
        my $name = substr($namespaces_split[$i], $pos1);
        $namespaces_split[$i] = $id."\t".$canonical."\t".$name;
        my @splitter = split( /\t/, $namespaces_split[$i]);
        if ( $namespaces_split[$i] =~ /^0/) {
            $namespace[$i][0] = 0;
        } else {
            $namespace[$i][0] = int($splitter[0]);
        }
        $namespace[$i][1] = $splitter[2];
        $namespace[$i][1] = '' if ($namespace[$i][0] == 0);
        $namespace[$i][2] = $splitter[1];
        $namespace[$i][2] = '' if ($namespace[$i][0] == 0);
        if ($namespace[$i][0] == 6) {
            # image namespace: local and canonical name
            $namespace_image[0] = $namespace[$i][1];
            $namespace_image[1] = $namespace[$i][2];
        }
        if ($namespace[$i][0] == 10) {
            # template namespace
            $namespace_templates[0] = $namespace[$i][1];
            $namespace_templates[1] = $namespace[$i][2] if ($namespace[$i][1] ne $namespace[$i][2]);
        }
        if ($namespace[$i][0] == 14) {
            # category namespace
            $namespace_cat[0] = $namespace[$i][1];
            $namespace_cat[1] = $namespace[$i][2] if ($namespace[$i][1] ne $namespace[$i][2]);
        }
    }
    #########################
    # namespace aliases
    #########################
    my $namespacealiases_text = '';
    $pos1 = index($metatext,'<namespacealiases>') + length('<namespacealiases>');
    $pos2 = index($metatext,'</namespacealiases>', $pos1);
    $namespacealiases_text = substr($metatext, $pos1, $pos2 -$pos1);
    $namespacealiases_text =~ s/<\/ns>/\n/g;
    # '<ns id="6">Bild' --> '6<TAB>Bild' (tag literal reconstructed)
    $namespacealiases_text =~ s/<ns id="([0-9]+)"[^>]*>/$1\t/g;
    my @namespacealiases_split = split( /\n/, $namespacealiases_text);
    $namespacealiases_count = @namespacealiases_split;
    for (my $i = 0; $i < $namespacealiases_count; $i++) {
        my @splitter = split( /\t/, $namespacealiases_split[$i]);
        if ($splitter[0] eq '6') {
            # alias name for image
            push(@namespace_image, $splitter[1]);
        }
        if ($splitter[0] eq '10') {
            # alias name for templates
            push(@namespace_templates, $splitter[1]);
        }
        if ($splitter[0] eq '14') {
            # alias name for category
            push(@namespace_cat, $splitter[1]);
        }
        # save all aliases
        $namespacealiases[$i][0] = $splitter[0];
        $namespacealiases[$i][1] = $splitter[1];
    }
    #########################
    # magic words
    #########################
    @magicword_defaultsort = get_magicword($metatext, 'defaultsort');
    @magicword_img_thumbnail = get_magicword($metatext, 'img_thumbnail');
    @magicword_img_manualthumb = get_magicword($metatext, 'img_manualthumb');
    @magicword_img_right = get_magicword($metatext, 'img_right');
    @magicword_img_left = get_magicword($metatext, 'img_left');
    @magicword_img_none = get_magicword($metatext, 'img_none');
    @magicword_img_center = get_magicword($metatext, 'img_center');
    @magicword_img_framed = get_magicword($metatext, 'img_framed');
    @magicword_img_frameless = get_magicword($metatext, 'img_frameless');
    @magicword_img_page = get_magicword($metatext, 'img_page');
    @magicword_img_upright = get_magicword($metatext, 'img_upright');
    @magicword_img_border = get_magicword($metatext, 'img_border');
    @magicword_img_sub = get_magicword($metatext, 'img_sub');
    @magicword_img_super = get_magicword($metatext, 'img_super');
    @magicword_img_link = get_magicword($metatext, 'img_link');
    @magicword_img_alt = get_magicword($metatext, 'img_alt');
    @magicword_img_width = get_magicword($metatext, 'img_width');
    @magicword_img_baseline = get_magicword($metatext, 'img_baseline');
    @magicword_img_top = get_magicword($metatext, 'img_top');
    @magicword_img_text_top = get_magicword($metatext, 'img_text_top');
    @magicword_img_middle = get_magicword($metatext, 'img_middle');
    @magicword_img_bottom = get_magicword($metatext, 'img_bottom');
    @magicword_img_text_bottom = get_magicword($metatext, 'img_text_bottom');
    #########################
    # statistics, e.g.:
    # <statistics pages="..." articles="..." ... />
    #########################
    my $statistic_text = '';
    $pos1 = index($metatext,'<statistics') + length('<statistics');
    $pos2 = index($metatext,'/>', $pos1);
    $statistic_text = substr($metatext, $pos1, $pos2 -$pos1);
    my @statistic = split(/ /,$statistic_text);
    foreach (@statistic) {
        if ($_ =~ /^pages/) {
            $statistic_online_page = $_;
            $statistic_online_page =~ s/pages=//g;
            $statistic_online_page =~ s/"//g;
            $statistic_online_page =~ s/ //g;
            two_column_display('pages online:', $statistic_online_page);
        }
    }
}
sub get_magicword {
    # Extract all aliases of one magic word from the siteinfo XML.
    # Params: $_[0] - the full siteinfo XML text
    #         $_[1] - the magic word name (e.g. 'defaultsort', 'img_left')
    # Returns the list of alias strings, empty if the word is not present.
    # NOTE(review): the tag literals and the lost "if" opening were destroyed
    # when this file was mangled (the stray closing brace before the sub end
    # shows a conditional existed); reconstructed from the siteinfo XML format:
    # <magicword name="KEY" ...><aliases><alias>X</alias>...</aliases></magicword>
    my $metatext = $_[0];
    my $key = $_[1];
    my @result;
    my $pos1 = index( $metatext, '<magicword name="'.$key.'"' );
    if ( $pos1 > -1 ) {
        my $pos2 = index( $metatext, '</magicword>', $pos1 );
        my $part = substr( $metatext, $pos1, $pos2 - $pos1 );
        my @part_split = split( /<alias>/, $part );
        shift (@part_split);    # first element is the text before the first <alias>
        foreach (@part_split) {
            my $pos3 = index ($_, '</alias>');
            my $alias = substr ($_, 0, $pos3);
            push (@result, $alias );
        }
    }
    return(@result);
}
sub get_next_page_from_dump {
    # Read line by line from the DUMP filehandle until one complete <page>
    # element has been collected; extract title, page id, revision id and
    # revision timestamp into their globals and leave the raw wikitext in the
    # global $text. Sets $end_of_dump = 'yes' at </mediawiki> or EOF.
    # NOTE(review): the <DUMP> reads and the XML tag literals below were lost
    # when this file was mangled (markup stripped); reconstructed from the
    # MediaWiki XML dump format.
    my $line = '';              # one line from the dump
    my $article_complete = 0;   # 1 when </page> was seen
    my $start_recording = 0;    # 1 between <page> and </page>
    my $revision_start = 0;     # 1 after <revision>
    # loop over every line of the current page
    do {
        $line = <DUMP>;
        $line_number = $line_number +1;
        if ($line =~ /<page>/) {
            $start_recording = 1;
        }
        if ($start_recording == 1) {
            $text = $text.$line;
        }
        if ($line =~ /<\/page>/) {
            $start_recording = 0;
            $article_complete = 1;
        }
        if ($line =~ /<title>/) {
            # extract the article title from "<title>...</title>"
            $title ="$line";
            my @content= split(/>/,$title);
            @content= split(/</,$content[1]);
            $title=$content[0];
        }
        if ($line =~ /<id>/ and $page_id == -1 ) {
            # the first <id> inside <page> is the page id
            $page_id ="$line";
            my @content= split(/>/,$page_id);
            @content= split(/</,$content[1]);
            $page_id = $content[0];
        }
        if ($line =~ /<revision>/) {
            $revision_start = 1;
        }
        if ($revision_start == 1 and $revision_id == -1 and $line =~ /<id>/) {
            # the first <id> inside <revision> is the revision id
            $revision_id ="$line";
            my @content= split(/>/,$revision_id);
            @content= split(/</,$content[1]);
            $revision_id=$content[0];
        }
        if ($revision_start == 1 and $line =~ /<timestamp>/) {
            # revision timestamp
            $revision_time ="$line";
            my @content= split(/>/,$revision_time);
            @content= split(/</,$content[1]);
            $revision_time=$content[0];
        }
        $end_of_dump = 'yes' if ($line =~ /<\/mediawiki>/);
        $end_of_dump = 'yes' if (eof(DUMP) == 1);
    }
    until ( $article_complete == 1 or $end_of_dump eq 'yes');
    # keep only the editable wikitext between <text ...> and </text>
    my $test = index ($text, '<text');
    $text = substr($text, $test);
    $text =~ s/^<text[^>]*>//;    # drop the opening <text ...> tag (literal reconstructed)
    $test = index($text, '</text');
    $text = substr($text,0,$test);
    $text = replace_special_letters($text);
}
sub get_next_page_from_live {
    # Deliver the next article for a live scan: walk error number by error
    # number through the scan lists, fetch article texts in batches of up to
    # 25 titles from the MediaWiki API, and leave title, page id and wikitext
    # in the globals $title, $page_id and $text.
    # Sets $end_of_live = 'yes' when no article is left.
    # NOTE(review): the XML tag literals below were lost when this file was
    # mangled (markup stripped) and have been reconstructed from the API XML
    # answer format - verify against a live API response.
    $current_live_article ++;    # next article
    if ( $current_live_error_scan != 0 ) {
        # scanning a regular error number (error 0 = new articles/last changes)
        if ($current_live_error_scan != 0 and $current_live_article == $maximum_current_error_scan) {
            # quota for this error reached
            if ( $error_description[$current_live_error_scan][3] < $max_error_count ) {
                # fewer than $max_error_count found so far - raise the quota
                $maximum_current_error_scan = $maximum_current_error_scan + ($max_error_count - $error_description[$current_live_error_scan][3]);
            } else {
                # enough found - postpone the remaining articles to the next run
                save_errors_for_next_scan($current_live_article);
                $current_live_article = -1;
            }
        }
        # switch to the next error number that still has articles
        if (($current_live_error_scan > 0 and $current_live_article == -1)
            or $current_live_article == $number_article_live_to_scan
            or $current_live_error_scan == -1) {
            $current_live_error_scan = 0 if ($current_live_error_scan == -1); # start with error 1
            do {
                $current_live_error_scan ++;
                @live_to_scan = ();
                if ($error_description[$current_live_error_scan][3] < $max_error_count) {
                    # not enough found via new/change/etc. - scan this error live
                    get_all_error_with_number($current_live_error_scan);
                } else {
                    # already enough for this error - remember the rest for later
                    get_all_error_with_number($current_live_error_scan);
                    save_errors_for_next_scan(0);
                    @live_to_scan = ();
                }
                $number_article_live_to_scan = @live_to_scan;
            } until ($current_live_error_scan >= $number_of_error_description
                or $number_article_live_to_scan > 0);
            $maximum_current_error_scan = $max_error_count;
            if ($error_description[$current_live_error_scan][3] > 0) {
                # part of the quota was already used by earlier phases
                $maximum_current_error_scan = $max_error_count - $error_description[$current_live_error_scan][3];
            }
            $current_live_article = 0;
            $xml_text_from_api = '';
        }
    }
    if ( $current_live_error_scan == 0
        and $current_live_article >= $number_article_live_to_scan ) {
        # end of live scan, no more articles
        $end_of_live = 'yes';
    }
    if ($current_live_error_scan >= $number_of_error_description) {
        # all error numbers done - finish with error 0 (new articles, last changes, ...)
        $current_live_article = 0;
        $xml_text_from_api = '';
        $current_live_error_scan = 0;
        get_all_error_with_number($current_live_error_scan);
        $number_article_live_to_scan = @live_to_scan;
        $maximum_current_error_scan = $max_error_count;
    }
    if ( $current_live_article < $number_article_live_to_scan
        and $number_article_live_to_scan > 0
        and $end_of_live ne 'yes' ) {
        # the current error still has articles - get the next one
        if ($xml_text_from_api eq '') {
            # batch buffer is empty - request the next block of articles
            my $many_titles = '';
            my $i = $current_live_article;
            my $end_many_title = 'false';
            do {
                my $line = $live_to_scan[$i];
                my @line_split = split( /\t/, $line);
                my $next_title = $line_split[0];
                print LOGFILE $next_title."\n" if (!$starter_modus);
                $next_title = replace_special_letters($next_title);
                $many_titles = $many_titles.'|'.uri_escape($next_title);
                $many_titles =~ s/^\|//;
                $i++;
                $end_many_title = 'true' if ($i == $number_article_live_to_scan);
                $end_many_title = 'true' if ($i == $current_live_article + 25); # not more than 25 articles per request
                $end_many_title = 'true' if ( length($many_titles) > 2000); # keep the URL short (non-latin titles escape long)
            }
            until ($end_many_title eq 'true');
            $xml_text_from_api = raw_text_more_articles( $many_titles );
            # strip the XML wrapper elements (tag literals reconstructed)
            $xml_text_from_api =~ s/^<\?xml version="1\.0"\?>//;
            $xml_text_from_api =~ s/^<api[^>]*>//;
            $xml_text_from_api =~ s/^<query[^>]*>//;
            $xml_text_from_api =~ s/^<pages[^>]*>//;
            $xml_text_from_api =~ s/<\/api>$//;
            $xml_text_from_api =~ s/<\/query>$//;
            $xml_text_from_api =~ s/<\/pages>$//;
        }
        # take the next <page> element from the buffered API answer
        if ($xml_text_from_api ne '') {
            my $pos_end = index ($xml_text_from_api, '</page>' );
            if ($pos_end > -1 ) {
                # normal page
                $text = substr ( $xml_text_from_api, 0, $pos_end + length('</page>') );
                $xml_text_from_api = substr ( $xml_text_from_api, $pos_end + length('</page>') );
            } else {
                # missing page, e.g. <page ns="0" title="X" missing="" />
                $pos_end = index ($xml_text_from_api, 'missing="" />' );
                $text = substr ( $xml_text_from_api, 0, $pos_end + length('missing="" />') );
                $xml_text_from_api = substr ( $xml_text_from_api, $pos_end + length('missing="" />') );
                if ($pos_end == -1){
                    # neither </page> nor a missing marker - broken API answer
                    print 'WARNING: Big problem with API'."\n";
                    print LOGFILE 'WARNING: Big problem with API'."\n" if (!$starter_modus);
                    $text = '';
                    $xml_text_from_api = '';
                }
            }
            my $line = $live_to_scan[$current_live_article];
            my @line_split = split( /\t/, $line);
            $title = $line_split[0];
            if (index ( $text, 'title='.'"'.$title.'"') == -1 ) {
                # the API may deliver pages in a different order - take the
                # title from the answer itself (title="...")
                my $pos_title = index ($text, 'title="');
                my $title_text = $text;
                $title_text = substr ( $title_text, $pos_title + length ('title="') );
                $pos_title = index ($title_text, '"');
                $title = substr ( $title_text, 0, $pos_title );
            }
            push(@article_was_scanned, $title);
            # page id from the answer
            my $test_id_pos = index ($text, 'pageid="');
            if ($test_id_pos > -1) {
                $page_id = substr($text, $test_id_pos + length( 'pageid="') );
                $test_id_pos = index ($page_id , '"');
                $page_id = substr($page_id, 0, $test_id_pos);
            }
            # wikitext between <text ...> and </text> (the lost "if/my $pos"
            # opening is reconstructed; the stray closing brace proved it)
            my $test = index ($text, '<text');
            if ($test > -1) {
                my $pos = index ($text, '">', $test );
                $text = substr($text, $pos + 2);
                $test = index($text,'</text>');
                $text = substr($text,0,$test);
            }
            # revision_id / revision_time are not extracted in live mode
            $text = replace_special_letters($text);
        }
    }
}
sub save_errors_for_next_scan {
    # Remember the titles of @live_to_scan from index $_[0] onwards, together
    # with the error number currently being scanned, in the global
    # $rest_of_errors_not_scan_yet so they can be picked up by the next run.
    my $from_number = $_[0];
    $number_article_live_to_scan = @live_to_scan;
    foreach my $entry ( @live_to_scan[ $from_number .. $number_article_live_to_scan - 1 ] ) {
        # first tab-separated field is the article title
        my ($kept_title) = split( /\t/, $entry );
        $rest_of_errors_not_scan_yet .= "\n".$kept_title."\t".$current_live_error_scan;
    }
}
sub get_all_error_with_number {
    # Collect into the global @live_to_scan every entry of @live_article that
    # carries the error number given in $_[0] (second tab field is a comma
    # separated list of error numbers) and that was not already scanned in
    # this run (@article_was_scanned).
    my $wanted_error = $_[0];
    foreach my $candidate (@live_article) {
        my @fields = split( /\t/, $candidate );
        # does this article carry the wanted error number?
        my $has_error = 'no';
        foreach my $err ( split( ', ', $fields[1] ) ) {
            $has_error = 'yes' if ( $wanted_error eq $err );
        }
        next if ( $has_error ne 'yes' );
        # skip articles that were already scanned today
        my $already_scanned = 'no';
        foreach my $done (@article_was_scanned) {
            # entries start with "<title>\t", so a prefix match identifies them
            $already_scanned = 'yes' if ( index( $candidate, $done."\t" ) == 0 );
        }
        push( @live_to_scan, $candidate ) if ( $already_scanned eq 'no' );
    }
}
sub get_all_error_with_type {
    # (currently unused) From the global @live_article keep only the lines
    # whose second tab column equals the given error type; matches go onto
    # the global @live_to_scan.
    my ($error_type) = @_;
    foreach my $live_line (@live_article) {
        my @columns = split /\t/, $live_line;
        push(@live_to_scan, $live_line) if ($columns[1] eq $error_type);
    }
}
sub replace_special_letters {
    # Decode the basic XML character entities of dump text into plain
    # characters and return the decoded string.  Only dump text is encoded
    # this way; live text must not be passed through this sub.
    # http://de.wikipedia.org/w/index.php?title=Benutzer_Diskussion:Stefan_K%C3%BChn&oldid=48573921#Dump
    #
    # Bug fix: the substitutions had degenerated into no-ops (and one that
    # deleted every '<'); restored per the original comment table below.
    my $content = $_[0];
    $content = '' unless defined $content;   # be safe on a failed fetch
    $content =~ s/&lt;/</g;      # &lt;   -> <
    $content =~ s/&gt;/>/g;      # &gt;   -> >
    $content =~ s/&quot;/"/g;    # &quot; -> "
    $content =~ s/&#039;/'/g;    # &#039; -> '
    $content =~ s/&amp;/&/g;     # &amp;  -> &  (last, so "&amp;lt;" -> "&lt;", not "<")
    return ($content);
}
sub raw_text {
# Fetch one article's latest revision (timestamp + wikitext) from the
# project's MediaWiki API and return the raw XML response body, or '' when
# the request failed or returned no body.
# NOTE(review): the four title substitutions below look corrupted - three
# are no-ops and one deletes every '<'; they presumably once escaped
# problem characters.  Confirm against a pristine copy of checkwiki.pl.
my $title = $_[0];
$title =~ s/&/%26/g; # Problem with & in title
$title =~ s/'/'/g; # Problem with apostroph in title
$title =~ s/<//g;
$title =~ s/"/"/g;
# http://localhost/~daniel/WikiSense/WikiProxy.php?wiki=$lang.wikipedia.org&title=$article
my $url2 = '';
#$url2 = 'http://localhost/~daniel/WikiSense/WikiProxy.php?wiki=de.wikipedia.org&title='.$title;
$url2 = $home;
# the API lives under /w/, article pages under /wiki/
$url2 =~ s/\/wiki\//\/w\//;
# old $url2 = $url2.'index.php?title='.$title.'&action=raw';
$url2 = $url2.'api.php?action=query&prop=revisions&titles='.$title.'&rvprop=timestamp|content&format=xml';
#print $url2."\n";
my $response2 ;
#do {
# NOTE(review): uri_escape() returns the escaped copy; the return value is
# discarded here, so $url2 is actually fetched unescaped - verify intent.
uri_escape($url2);
#print $url2."\n";
#uri_escape( join ' ' => @ARGV );
my $ua2 = LWP::UserAgent->new;
$response2 = $ua2->get( $url2 );
#}
#until ($response2->is_success);
my $content2 = $response2->content;
my $result2 = '';
# normalize undef/empty response body to ''
$result2 = $content2 if ($content2) ;
return($result2);
}
sub raw_text2 {
# GET an arbitrary URL and return the response body, or '' when the request
# failed or returned no body.
# NOTE(review): the apostrophe substitution is a no-op (likely corrupted),
# and escaping '&' in a complete URL also mangles its query-string
# separators - verify both against a pristine copy of checkwiki.pl.
my $url = $_[0];
$url =~ s/&/%26/g; # Problem with & in title
$url =~ s/'/'/g; # Problem with apostroph in title
my $response2 ;
# NOTE(review): uri_escape() result is discarded; $url is used as-is.
uri_escape($url);
my $ua2 = LWP::UserAgent->new;
$response2 = $ua2->get( $url );
my $content2 = $response2->content;
my $result2 = '';
# normalize undef/empty response body to ''
$result2 = $content2 if ($content2) ;
return($result2);
}
sub raw_text_more_articles {
# Fetch revisions (timestamp + content) via the MediaWiki API for the given
# title argument; presumably several titles joined with '|' in one request
# (TODO confirm against callers).  Returns the raw XML response body, or ''
# when the request failed or returned no body.
my $title = $_[0];
#$title =~ s/&/%26/g; # Problem with & in title
#$title =~ s/'/'/g; # Problem with apostroph in title
#$title =~ s/<//g;
#$title =~ s/"/"/g;
#$title =~ s/'/'/g;
my $url2 = '';
$url2 = $home;
# the API lives under /w/, article pages under /wiki/
$url2 =~ s/\/wiki\//\/w\//;
$url2 = $url2.'api.php?action=query&prop=revisions&titles='.$title.'&rvprop=timestamp|content&format=xml';
print LOGFILE $url2."\n" if (!$starter_modus);
my $response2 ;
my $ua2 = LWP::UserAgent->new;
$response2 = $ua2->get( $url2 );
my $content2 = $response2->content;
my $result2 = '';
# normalize undef/empty response body to ''
$result2 = $content2 if ($content2) ;
return($result2);
}
####################################
sub load_text_translation{
    # Determine the per-project translation page, fetch it, and fill the
    # global translation texts plus the translated priority / headline /
    # description columns ([4], [5], [6]) of @error_description.
    my %translation_page_of = (
        'afwiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'arwiki'      => 'ويكيبيديا:فحص_ويكيبيديا/ترجمة',
        'cawiki'      => 'Viquipèdia:WikiProject Check Wikipedia/Translation',
        'cswiki'      => 'Wikipedie:WikiProjekt Check Wikipedia/Translation',
        'commonswiki' => 'Commons:WikiProject Check Wikipedia/Translation',
        'cywiki'      => 'Wicipedia:WikiProject Check Wikipedia/Translation',
        'dawiki'      => 'Wikipedia:WikiProjekt Check Wikipedia/Oversættelse',
        'dewiki'      => 'Wikipedia:WikiProjekt Syntaxkorrektur/Übersetzung',
        'dewiki_test' => 'Wikipedia:WikiProjekt Syntaxkorrektur/Übersetzung',
        'enwiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'eowiki'      => 'Projekto:Kontrolu Vikipedion/Tradukado',
        'eswiki'      => 'Wikiproyecto:Check Wikipedia/Translation',
        'fiwiki'      => 'Wikipedia:Wikiprojekti Check Wikipedia/Translation',
        'frwiki'      => 'Projet:Correction syntaxique/Traduction',
        'fywiki'      => 'Meidogger:Stefan Kühn/WikiProject Check Wikipedia/Translation',
        'hewiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'huwiki'      => 'Wikipédia:Ellenőrzőműhely/Fordítás',
        'idwiki'      => 'Wikipedia:ProyekWiki Cek Wikipedia/Terjemahan',
        'iswiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'itwiki'      => 'Wikipedia:WikiProjekt Check Wikipedia/Translation',
        'jawiki'      => 'プロジェクト:ウィキ文法のチェック/Translation',
        'lawiki'      => 'Vicipaedia:WikiProject Check Wikipedia/Translation',
        'ndswiki'     => 'Wikipedia:Wikiproject Check Wikipedia/Translation',
        'nds_nlwiki'  => 'Wikipedie:WikiProject Check Wikipedia/Translation',
        'nlwiki'      => 'Wikipedia:Wikiproject/Check Wikipedia/Vertaling',
        'nowiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'pdcwiki'     => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'plwiki'      => 'Wikiprojekt:Check Wikipedia/Tłumaczenie',
        'ptwiki'      => 'Wikipedia:Projetos/Check Wikipedia/Tradução',
        'ruwiki'      => 'Википедия:Страницы с ошибками в викитексте/Перевод',
        'rowiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'skwiki'      => 'Wikipédia:WikiProjekt Check Wikipedia/Translation',
        'svwiki'      => 'Wikipedia:Projekt wikifiering/Syntaxfel/Translation',
        'trwiki'      => 'Vikipedi:Vikipedi proje kontrolü/Çeviri',
        'ukwiki'      => 'Вікіпедія:Проект:Check Wikipedia/Translation',
        'yiwiki'      => 'װיקיפּעדיע:קאנטראלירן_בלעטער/Translation',
        'zhwiki'      => '维基百科:错误检查专题/翻译',
    );
    # unknown projects keep whatever $translation_page already holds
    $translation_page = $translation_page_of{$project} if exists $translation_page_of{$project};
    two_column_display('load translation of:', $translation_page) if (!$silent_modus);
    my $translation_input = replace_special_letters( raw_text($translation_page) );
    # general page texts - only overwrite the defaults when a translation exists
    my $found_text = '';
    $found_text = get_translation_text($translation_input, 'start_text_'.$project.'=', 'END');
    $start_text = $found_text if ($found_text ne '');
    $found_text = get_translation_text($translation_input, 'description_text_'.$project.'=', 'END');
    $description_text = $found_text if ($found_text ne '');
    $found_text = get_translation_text($translation_input, 'category_001=', 'END');
    $category_text = $found_text if ($found_text ne '');
    # priority labels
    $found_text = get_translation_text($translation_input, 'top_priority_'.$project.'=', 'END');
    $top_priority_project = $found_text if ($found_text ne '');
    $found_text = get_translation_text($translation_input, 'middle_priority_'.$project.'=', 'END');
    $middle_priority_project = $found_text if ($found_text ne '');
    $found_text = get_translation_text($translation_input, 'lowest_priority_'.$project.'=', 'END');
    $lowest_priority_project = $found_text if ($found_text ne '');
    # per-error translations: priority [4], headline [5], description [6]
    for my $i (1 .. $number_of_error_description - 1) {
        my $error_key = sprintf 'error_%03d', $i;   # error_001, error_002, ...
        my $prio = get_translation_text($translation_input, $error_key.'_prio_'.$project.'=', 'END');
        # fall back to the script default when no translation was found
        $prio = ($prio ne '') ? int($prio) : $error_description[$i][0];
        # -1 means "unknown in this project": use the script default as well
        $prio = $error_description[$i][0] if ($prio == -1);
        $error_description[$i][4] = $prio;
        $error_description[$i][5] = get_translation_text($translation_input, $error_key.'_head_'.$project.'=', 'END');
        $error_description[$i][6] = get_translation_text($translation_input, $error_key.'_desc_'.$project.'=', 'END');
    }
}
sub get_translation_text {
    # Extract one translated value from the translation page: the text
    # between $start_tag and the following $end_tag, reduced to what comes
    # after the first '=', with a single leading and trailing blank
    # stripped.  Returns '' when the tags are not found.
    my ($translation_text, $start_tag, $end_tag) = @_;
    my $start_pos = index($translation_text, $start_tag);
    my $end_pos   = index($translation_text, $end_tag, $start_pos);
    return ('') unless ($start_pos > -1 and $end_pos > 0);
    my $found = substr($translation_text, $start_pos, $end_pos - $start_pos);
    # keep only the value part after the key's '='
    $found = substr($found, index($found, '=') + 1);
    $found =~ s/^ //;   # one leading blank only, as before
    $found =~ s/ $//;   # one trailing blank only, as before
    return ($found);
}
sub get_translation_text_XHTML{
# Convert wikitext to XHTML via the MediaWiki parse API.  Marked as not
# working by the author; additionally the entity clean-up block near the
# end is visibly corrupted (an unbalanced quote on an index() line), so
# this sub is effectively dead code until restored from a pristine copy of
# checkwiki.pl.  No code below has been altered.
# don't work today
# use Wikipedia-API to get XHTML from Wikitext
# http://www.mediawiki.org/wiki/API:Parsing_wikitext#parse
# http://en.wikipedia.org/w/api.php?action=parse&text=%5B%5Bfoo%5D%5D%20%5B%5BAPI:Query|bar%5D%5D%20%5Bhttp://www.example.com/%20baz%5D
my $translation_text = $_[0];
my $xhtml_text = '';
print 'Translation='.$translation_text."\n";
if ($translation_text ne '') {
my $url = '';
$url = $home;
# the API lives under /w/, article pages under /wiki/
$url =~ s/\/wiki\//\/w\//;
$url = $url.'api.php?action=parse&text='.$translation_text;
print 'URL='.$url."\n";
my $response ;
my $ua = LWP::UserAgent->new;
$response = $ua->get( $url );
my $content = $response->content;
$xhtml_text = $content if ($content) ;
# only text, delete all other
my $pos = index($xhtml_text, 'text xml:space=');
$xhtml_text = substr ($xhtml_text ,$pos);
$pos = index($xhtml_text, '')+length('');
$xhtml_text = substr ($xhtml_text ,$pos);
$pos = index($xhtml_text, '></text>');
$xhtml_text = substr ($xhtml_text ,0, $pos);
$pos = index($xhtml_text, '/g;
$xhtml_text =~ s/"/>/g;
$xhtml_text =~ s/&/&/g;
$xhtml_text =~ s/<//g;
$xhtml_text =~ s/"/>/g;
#$xhtml_text =~ s/"/"/g;
#$xhtml_text =~ s/'/'/g;
}
print 'XHTML='.$xhtml_text ."\n";
return ($xhtml_text);
}
sub output_errors_desc_in_db{
    # Write the current error descriptions (including translations) into
    # cw_error_desc: try an update per error id first, insert the row when
    # it does not exist yet.  Only runs in live mode after the load run.
    #
    # mysql> desc cw_error_desc;
    # +-----------------+---------------+------+-----+---------+-------+
    # | Field           | Type          | Null | Key | Default | Extra |
    # +-----------------+---------------+------+-----+---------+-------+
    # | project         | varchar(100)  | YES  |     | NULL    |       |
    # | id              | int(8)        | YES  |     | NULL    |       |
    # | prio            | int(4)        | YES  |     | NULL    |       |
    # | name            | varchar(255)  | YES  |     | NULL    |       |
    # | text            | varchar(4000) | YES  |     | NULL    |       |
    # | name_html       | varchar(255)  | YES  |     | NULL    |       |
    # | text_html       | varchar(4000) | YES  |     | NULL    |       |
    # | name_wiki_trans | varchar(255)  | YES  |     | NULL    |       |
    # | text_wiki_trans | varchar(4000) | YES  |     | NULL    |       |
    # | name_html_trans | varchar(255)  | YES  |     | NULL    |       |
    # | text_html_trans | varchar(4000) | YES  |     | NULL    |       |
    # +-----------------+---------------+------+-----+---------+-------+
    if ($load_modus_done and $dump_or_live eq 'live') {
        two_column_display('Update descripton in DB:', 'insert new and update old error description') if (!$silent_modus);
        for (my $i = 1; $i < $number_of_error_description; $i++) {
            my $sql_headline = $error_description[$i][1];
            my $sql_desc = substr($error_description[$i][2], 0, 3999);            # text columns are varchar(4000)
            my $sql_headline_trans = $error_description[$i][5];
            # bug fix: the truncated translation used to overwrite $sql_desc,
            # and the *untruncated* translation went into the SQL statement
            my $sql_desc_trans = substr($error_description[$i][6], 0, 3999);      # max 4000
            # bind parameters instead of hand-escaped, string-built SQL:
            # quoting is handled by the driver, no injection via descriptions
            my $sth = $dbh->prepare(
                'update cw_error_desc set prio=?, name=?, text=?, name_trans=?, text_trans=? where id=? and project=?;');
            my $rows_updated = $sth->execute(
                $error_description[$i][4], $sql_headline, $sql_desc,
                $sql_headline_trans, $sql_desc_trans, $i, $project);
            if ($rows_updated eq '1') {
                # row existed and was updated - nothing more to do
            } else {
                two_column_display('new error:', 'description insert into db');
                $sth = $dbh->prepare(
                    'insert into cw_error_desc (project, id, prio, name, text, name_trans, text_trans) values (?, ?, ?, ?, ?, ?, ?);');
                $sth->execute($project, $i, $error_description[$i][4],
                    $sql_headline, $sql_desc, $sql_headline_trans, $sql_desc_trans);
            }
        }
    }
}
sub output_text_translation_wiki{
    # Write the translation text file for the current project.
    # NOTE(review): the printed literal appears truncated by the same
    # corruption seen elsewhere in this file - restore from upstream.
    my $filename = $output_directory.$project.'/'.$project.'_'.$translation_file;
    two_column_display('Output translation text to:', $project.'_'.$translation_file) if (!$silent_modus);
    #######################################
    # fix: checked three-arg open (was an unchecked two-arg open, silently
    # discarding the output when the directory is missing)
    open(TRANSLATION, '>', $filename) or die "Cannot open $filename: $!\n";
    print TRANSLATION '
'."\n";
    close(TRANSLATION);
}
sub output_little_statistic{
    # Report how many errors this run has found so far.
    printf "errors found:\t\t%s (+1)\n", $error_counter;
}
sub output_duration {
    # Print how long the whole run took, as minutes plus seconds, based on
    # the global $time_start; also sets the global $time_end.
    $time_end = time();
    my $duration = $time_end - $time_start;
    my $duration_minutes = int($duration / 60);
    # fix: the old centiminute round-trip
    #   int(((int(100*($d/60))/100) - $minutes) * 60)
    # lost up to one second to rounding; the remainder is exact
    my $duration_secounds = $duration % 60;
    print 'Duration:'."\t\t".$duration_minutes.' minutes '.$duration_secounds.' secounds'."\n";
    print $project.' '.$dump_or_live."\n" if (!$silent_modus);
}
#############################################################################
sub check_article{
# Run every syntax check against the current article (globals $text, $title,
# $page_id).  Pre-processing subs first strip or record structural elements,
# then error_check() runs the numbered checks.
my $steps = 1;
$steps = 1 if ($dump_or_live eq 'live');
$steps = 5000 if ($silent_modus eq 'silent');
# debug hook: enable detailed output for selected test articles
if ( $title eq 'At-Tabarī'
or $title eq 'Rumänien'
or $title eq 'Liste der Ortsteile im Saarland') {
# $details_for_page = 'yes';
}
# Fixture text for manually exercising the checks below (activate via the
# commented assignment after the literal).  Do not edit the literal: it
# deliberately contains broken wiki syntax, and as a double-quoted string
# it also contains escape sequences (e.g. \b) that must stay as-is.
my $text_for_tests = "Hallo
Barnaby, Wendy. The Plague Makers: The Secret World of Biological Warfare, Frog Ltd, 1999.
in en [[Japanese war crimes]]
{{DEFAULTSORT:Role-playing game}}
=== Test ===
ISBN 1-883319-85-4 ISBN 0-7567-5698-7 ISBN 0-8264-1258-0 ISBN 0-8264-1415-X
* Tulku - ISBN 978 90 04 12766 0 (wrong ISBN)
:-sdfsdf[[http://www.wikipedia.org Wikipedia]] chack tererh
:#sadf
ISBN 3-8304-1007-7 ok
ISBN 3-00-016815-X ok
ISBN 978-0-8330-3930-9 ok
ISBN3-00-016815-X
[[Category:abc]] and [[Category:Abc]]ä
[[1911 př. n. l.|1911]]–[[1897 př. n. l.|1897]] př. n. l.
Rodné jméno = <--M17-Y5:N35-G17-F4:X1-->
Trůnní jméno = M23-L2-<--N5:S12-D28*D28:D28--> Ö 124345
===This is a headline with reference A reference with '''bold''' text===
Nubkaure
-V28-V31:N35-G17-C10-
Jméno obou paní = ü-G16-V28-V31:N35-G17-C10-
[[Image:logo|thumb| < small> sdfsdf]]
Abu XY
im text ISBN 3-8304-1007-7 im text <-- ok
im text ISBN 3-00-016815-X im text ok
im text ISBN 978-0-8330-3930-9 im text ok
[[Image:logo|thumb| Part < small> Part2 Part2]]
[[Image:logo|thumb| Part < small> Part]]
ISBN-10 3-8304-1007-7 bad
ISBN-10: 3-8304-1007-7 bad
ISBN-13 978-0-8330-3930-9 bad
ISBN-13: 978-0-8330-3930-9 -->bad
Abu XY
ISBN 123451678XXXX bad
ISBN 123456789x ok
ISBN 3-00-0168X5-X bad
*ISBN 3-8304-1007-7 121 Test ok
*ISBN 3-8304-1007-7121 Test bad
*ISBN 3 8304 1007 7 121 Test ok
*ISBN 978-0-8330-39 309 Test ok
*ISBN 9 7 8 0 8 3 3 0 3 9 3 0 9 Test bad 10 ok 13
[http://www.dehoniane.it/edb/cat_dettaglio.php?ISBN=24109] bad
{{test|ISBN=3 8304 1007 7 121 |test=[[text]]}} bad
[https://www5.cbonline.nl/pls/apexcop/f?p=130:1010:401581703141772 ISBN-bureau] bad
ISBN 3-8304-1007-7
<\br>
[[:hu:A Gibb fivérek által írt dalok listája]] Big Problem
[[en:Supermann]]
testx
=== Liste ===
test
=== 1Acte au sens d'''instrumentum'' ===
=== 2Acte au sens d'''instrumentum''' ===
== 3Acte au sens d''instrumentum'' ==
ISBN 978-88-10-24109-7
* ISBN 0-691-11532-X ok
* ISBN 123451678XXXX bad
* ISBN-10 1234567890 bad
* ISBN-10: 1234567890 bad
* ISBN-13 1234567890123 bad
* ISBN-13: 1234567890123 bad
* ISBN 123456789x Test ok
* ISBN 123456789x x12 Test
* ISBN 123456789012x Test
* ISBN 1234567890 12x Test
* ISBN 123456789X 123 Test
* ISBN 1 2 3 4 5 6 7 8 9 0 Test
[http://www.dehoniane.it/edb/cat_dettaglio.php?ISBN=24109]
[https://www5.cbonline.nl/pls/apexcop/f?p=130:1010:401581703141772 ISBN-bureau]
* Tramlijn_Ede_-_Wageningen - ISBN-nummer
* Tulku - ISBN 978 90 04 12766 0 (wrong ISBN)
* Michel_Schooyans - [http://www.dehoniane.it/edb/cat_dettaglio.php?ISBN=24109]
*VARA_gezinsencyclopedie - [https://www5.cbonline.nl/pls/apexcop/f?p=130:1010:401581703141772 ISBN-bureau]
Testtext hat einen [[|Link]], der nirgendwo hinführt.Kees Heitink en Gert Jan Koster, De tram rijdt weer!: Bennekomse tramgeschiedenis 1882 - 1937 - 1968 - 2008, 28 bladzijden, geen ISBN-nummer, uitverkocht..
=== 4Acte au sens d''instrumentum'' ===
[[abszolútérték-függvény||]] ''f''(''x'') – ''f''(''y'') [[abszolútérték-függvény||]] aus huwiki
* [[Antwerpen (stad)|Antwerpen]] heeft na de succesvolle organisatie van de Eurogames XI in [[2007]] voorstellen gedaan om editie IX van de Gay Games in [[2014]] of eventueel de 3e editie van de World OutGames in [[2013]] naar Antwerpen te halen. Het zogeheten '[[bidbook]]' is ingediend en het is afwachten op mogelijke toewijzing door de internationale organisaties.
*a[[B:abc]]
*bas addf< br>
*casfdasdf< br >
*das fdasdf< br / >
[[Chełmno]] and
sdfsf ISBN 3434462236
95-98. ISBN 0 7876 5784 0. .
=== UNO MANDAT ===
0-13-110370-9
* [http://www.research.att.com/~bs/3rd.html The C++ Programming Language]: [[Bjarne Stroustrup]], special ed., Addison-Weslye, ISBN 0-201-70073-5, 2000
* The C++ Standard, Incorporating Technical Corrigendum 1, BS ISO/IEC 14882:2003 (2nd ed.), John Wiley & Sons, ISBN 0-470-84674-7
* [[Brian Kernighan|Brian W. Kernighan]], [[Dennis Ritchie|Dennis M. Ritchie]]: ''[[The C Programming Language]]'', Second Edition, Prentice-Hall, ISBN 0-13-110370-9 1988
* [http://kmdec.fjfi.cvut.cz/~virius/publ-list.html#CPP Programování v C++]: Miroslav Virius, [http://www.cvut.cz/cs/uz/ctn Vydavatelství ČVUT], druhé vydání, ISBN 80-01-02978-6 2004
* Naučte se C++ za 21 dní: Jesse Liberty, [http://www.cpress.cz/ Computer Press], ISBN 80-7226-774-4, 2002
* Programovací jazyk C++ pro zelenáče: Petr Šaloun, [http://www.neo.cz Neokortex] s.r.o., ISBN 80-86330-18-4, 2005
* Rozumíme C++: Andrew Koenig, Barbara E. Moo, [http://www.cpress.cz/ Computer Press], ISBN 80-7226-656-X, 2003
* [http://gama.fsv.cvut.cz/~cepek/uvodc++/uvodc++-2004-09-11.pdf Úvod do C++]: Prof. Ing. Aleš Čepek, CSc., Vydavatelství ČVUT, 2004
*eaa[[abc]]< br / >
sdfsdf .
Verlag LANGEWIESCHE, ISBN-10: 3784551912 und ISBN-13: 9783784551913
=== Meine Überschrift ABN === ISBN 1234-X-1234
*fdd asaddf…
{{Zitat|Der Globus ist schön. Buch 27}}
{{Zitat|Madera=1000 Buch 27|Kolumbus{{Höhe|name=123}}|kirche=4 }}
==== Саларианцы ====
[[Breslau]] ([[Wrocław]])
*gffasfdasdf<\br7>
{{Testvorlage|name=heeft na de succesvolle organisatie van de [[Eurogames XIa|Eurogames XI]] inheeft na de succesvolle organisatie van de Eurogames XI inheeft na de succesvolle organisatie van de Eurogames XI in123 |ott]o=kao}}
*hgasfda sdf sdfsdf2!
===== PPM, PGM, PBM, PNM =====
===== PPM, PGM, PBM, PNM =====
" .'test –uberlappung3456Ende des Text';
# $text = $text_for_tests;
get_namespace();
print_article_title_every_x( $steps );
delete_old_errors_in_db();
# pre-processing: strip/record structural elements before the checks run
get_comments_nowiki_pre();
get_math();
get_source();
get_code();
get_syntaxhighlight();
get_isbn();
get_templates();
get_links();
get_images();
get_tables();
get_gallery();
get_hiero(); #problem with <-- and --> (error 056)
get_ref();
check_for_redirect();
get_categories();
get_interwikis();
create_line_array();
get_line_first_blank();
get_headlines();
# run the numbered error checks on the prepared article
error_check();
#get_coordinates() if (-e $file_module_coordinate) ;
#get_persondata();
set_article_as_scan_live_in_db($title, $page_id) if ($dump_or_live eq 'live');
}
sub print_article_title_every_x{
    # Print a one-line progress message for the first article and for every
    # x-th article thereafter.
    my ($steps) = @_;
    my $counter_output = '';
    (my $project_output = $project) =~ s/wiki$//;
    # nearest multiple of $steps at or below the current page number
    my $nearest_multiple = int($page_number / $steps) * $steps;
    if ($page_number == 1 or $page_number == $nearest_multiple) {
        my $percent = int($page_number / $statistic_online_page * 100) . '%';
        if ($dump_or_live eq 'live') {
            # in live mode show "current/total" instead of a percentage
            $percent = ($current_live_article + 1) . '/' . $number_article_live_to_scan;
        }
        printf "%-3s %-8s %-5s %-8s %-40s\n",
            $project_output, 'p=' . $page_number, $percent, 'id=' . $page_id, $title;
    }
    print LOGFILE $counter_output if (!$starter_modus);
}
sub delete_old_errors_in_db{
    # In live mode drop all previously stored errors of the current article
    # (globals $page_id, $title) so the fresh scan result replaces them.
    if ( $dump_or_live eq 'live'
    and $page_id
    and $title ne '' ) {
        # bind parameters instead of interpolating $page_id/$project into
        # the SQL string (safe quoting, no injection)
        my $sth = $dbh->prepare(
            "delete /* SLOW_OK */ from cw_error where error_id = ? and project = ?;");
        $sth->execute($page_id, $project);
    }
}
sub get_namespace{
    # Derive the global $page_namespace from the title's prefix.  Titles
    # without ':' and unknown prefixes end up in the article namespace (0).
    # If this goes wrong, a new namespace may exist in this project; see
    # sub load_metadata_from_file.
    if ( index( $title, ':' ) == -1) {
        $page_namespace = 0;
        return;
    }
    # canonical and localized namespace names
    for my $i (0 .. $namespaces_count - 1) {
        foreach my $column (1, 2) {
            $page_namespace = $namespace[$i][0]
                if ( index($title, $namespace[$i][$column] . ':') == 0);
        }
    }
    # namespace aliases
    for my $i (0 .. $namespacealiases_count - 1) {
        $page_namespace = $namespacealiases[$i][0]
            if ( index($title, $namespacealiases[$i][1] . ':') == 0);
    }
    # -100 marks "no namespace matched": treat as article namespace
    $page_namespace = 0 if ($page_namespace == -100);
}
sub get_comments_nowiki_pre{
# Locate comment, nowiki and pre regions in the global $text, report
# unclosed ones (errors 005/023/024) and blank closed ones out so later
# checks ignore their content.
# NOTE(review): from the first index() call onward this sub is corrupted -
# the tag string literals (apparently the comment/nowiki/pre tags) were
# stripped, leaving unbalanced quotes, and the tail of a helper (apparently
# get_next_comment, which fills @comments and blanks the comment out) has
# been fused onto the end.  Restore this whole region from a pristine copy
# of checkwiki.pl; no code below has been altered.
my $last_pos = -1;
my $pos_comment = -1;
my $pos_nowiki = -1;
my $pos_pre = -1;
my $pos_first = -1;
my $loop_again = 0;
do {
# next tag
$pos_comment = index ($text, '', $pos_comment + length('
$last_pos = get_next_comment($pos_comment + $last_pos);
$loop_again = 1;
#print 'comment'.' '.$pos_comment.' '.$last_pos."\n";
}
if ($tag_first eq 'comment' and $pos_comment_end == -1) {
#found
$last_pos = $pos_comment +1;
$loop_again = 1;
#print 'comment no end'."\n";
my $text_output = substr( $text, $pos_comment);
$text_output = text_reduce($text_output, 80);
error_005_Comment_no_correct_end ('check', $text_output );
#print $text_output."\n";
}
#nowiki
if ($tag_first eq 'nowiki' and $pos_nowiki_end > -1) {
# found and
$last_pos = get_next_nowiki($pos_nowiki + $last_pos);
$loop_again = 1;
#print 'nowiki'.' '.$pos_nowiki.' '.$last_pos."\n";
}
if ($tag_first eq 'nowiki' and $pos_nowiki_end == -1) {
# found and no
$last_pos = $pos_nowiki +1;
$loop_again = 1;
#print 'nowiki no end'."\n";
my $text_output = substr( $text,$pos_nowiki);
$text_output = text_reduce($text_output, 80);
error_023_nowiki_no_correct_end('check', $text_output );
}
#pre
if ($tag_first eq 'pre' and $pos_pre_end > -1) {
# found
and
$last_pos = get_next_pre($pos_pre + $last_pos);
$loop_again = 1;
#print 'pre'.' '.$pos_pre.' '.$last_pos."\n";
}
if ($tag_first eq 'pre' and $pos_pre_end == -1) {
# found
and no
#print $last_pos.' '.$pos_pre."\n";
$last_pos = $pos_pre +1;
$loop_again = 1;
#print 'pre no end'."\n";
my $text_output = substr( $text,$pos_pre);
$text_output = text_reduce($text_output, 80);
error_024_pre_no_correct_end ('check', $text_output);
}
#end
if ($pos_comment == -1
and $pos_nowiki == -1
and $pos_pre == -1) {
# found no ', $pos_start + length('');
$comment_counter = $comment_counter +1;
$comments[$comment_counter][0] = $pos_start;
$comments[$comment_counter][1] = $pos_end;
$comments[$comment_counter][2] = substr($text, $pos_start, $pos_end - $pos_start );
#print $comments[$comment_counter][2]."\n";
#replace comment with space
my $text_before = substr( $text, 0, $pos_start );
my $text_after = substr( $text, $pos_end );
my $filler = '';
for (my $i = 0; $i < ($pos_end-$pos_start); $i++) {
$filler = $filler.' ';
}
$text = $text_before.$filler.$text_after;
$result = $pos_end;
}
return ($result );
}
sub get_math {
my $pos_start_old = 0;
my $pos_end_old = 0;
my $end_search = 'yes';
do {
my $pos_start = 0;
my $pos_end = 0;
$end_search = 'yes';
#get position of next