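Before running the script, update `$rootPath` (the first assignment in the script) to the folder containing your dump, and replace `yourSite` in the namespace skip check with your own wiki's project namespace name.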
<?php
/**
 * Splits a MediaWiki XML dump into one HTML file per article.
 *
 * Every <page> element of the dump is written to
 * "<rootPath>articlesSplit/<sanitised title>.html" as "<h1>title</h1> text".
 * Pages whose title starts with one of the configured namespace prefixes
 * are skipped. Written for PHP 5.4+ (no newer language features are used).
 */

// ---- Configuration: EDIT BEFORE RUNNING --------------------------------

// Folder that contains the dump; the output goes to its "articlesSplit" subfolder.
$rootPath = "C:/something/UniServerZ/www/wikimigration/articles/";
$xmlDumpFilename = "backup-dump.xml";

// Namespaces that are not exported. Replace "yourSite" with the project
// namespace of your own wiki. Names are written with spaces, the way they
// appear in dump titles; underscore spellings are normalised before comparing.
$skippedNamespaces = [
    'User',
    'User talk',
    'Category',
    'Talk',
    'Module',
    'MediaWiki',
    'File',
    'File talk',
    'yourSite',
    'yourSite talk',
];

// ---- Helpers -----------------------------------------------------------

/**
 * Returns the raw content of the first <$tag ...>...</$tag> element in $xml.
 *
 * A self-closing element (e.g. <text bytes="0" />, which is how an empty
 * revision has no closing tag) yields an empty string.
 *
 * NOTE: the match is on the prefix "<$tag", so this assumes no other element
 * name inside $xml starts with $tag.
 *
 * @param string $xml fragment to search
 * @param string $tag element name without angle brackets
 * @return string|null element content, or null if the element is missing or unterminated
 */
function extractElement($xml, $tag)
{
    $openPos = strpos($xml, '<' . $tag);
    if ($openPos === false) {
        return null;
    }

    // The opening tag may carry attributes, so look for its closing '>'.
    $openEnd = strpos($xml, '>', $openPos);
    if ($openEnd === false) {
        return null;
    }
    if ($xml[$openEnd - 1] === '/') {
        return '';
    }

    $contentStart = $openEnd + 1;
    $closePos = strpos($xml, '</' . $tag . '>', $contentStart);
    if ($closePos === false) {
        return null;
    }

    return substr($xml, $contentStart, $closePos - $contentStart);
}

/**
 * Tells whether a (decoded) page title belongs to one of the skipped namespaces.
 *
 * Only the prefix before the first ':' is compared, so a main-namespace
 * article such as "Real Talk: Episode 1" is not mistaken for a Talk page.
 *
 * @param string   $title             decoded page title
 * @param string[] $skippedNamespaces namespace names (case-sensitive)
 * @return bool
 */
function isInSkippedNamespace($title, array $skippedNamespaces)
{
    $colonPos = strpos($title, ':');
    if ($colonPos === false) {
        return false;
    }

    $namespace = str_replace('_', ' ', substr($title, 0, $colonPos));

    return in_array($namespace, $skippedNamespaces, true);
}

/**
 * Turns a decoded page title into a filename by replacing every character
 * that is illegal in Windows filenames with "__".
 *
 * Distinct titles can map to the same filename (e.g. "A:B" and "A/B").
 *
 * @param string $title decoded page title
 * @return string
 */
function titleToFilename($title)
{
    return str_replace(['<', '>', ':', '"', '/', '\\', '|', '?', '*'], '__', $title);
}

/**
 * Splits the dump into one HTML file per page.
 *
 * @param string   $rootPath          folder containing the dump (trailing slash optional)
 * @param string   $xmlDumpFilename   dump file name inside $rootPath
 * @param string[] $skippedNamespaces namespaces that must not be exported
 * @return array   ['pagecount' => int, 'skipped' => int]
 * @throws RuntimeException if the dump cannot be read or the output folder cannot be created
 */
function splitWikiDump($rootPath, $xmlDumpFilename, array $skippedNamespaces)
{
    $rootPath = rtrim($rootPath, '/\\') . '/';
    $dumpPath = $rootPath . $xmlDumpFilename;

    if (!is_file($dumpPath) || !is_readable($dumpPath)) {
        throw new RuntimeException("cannot read dump file $dumpPath");
    }
    $dump = file_get_contents($dumpPath);
    if ($dump === false) {
        throw new RuntimeException("cannot read dump file $dumpPath");
    }

    $outputDir = $rootPath . 'articlesSplit/';
    if (!is_dir($outputDir) && !mkdir($outputDir, 0777, true) && !is_dir($outputDir)) {
        throw new RuntimeException("cannot create output folder $outputDir");
    }

    $pagecount = 0;
    $skipped = 0;
    $pageEndTagLength = strlen('</page>');

    // Walk through the dump with a cursor instead of repeatedly cutting the
    // processed part off the front, which would copy the remaining dump once
    // per page.
    $offset = 0;
    while (($pageStartPos = strpos($dump, '<page>', $offset)) !== false) {
        $pageEndPos = strpos($dump, '</page>', $pageStartPos);
        if ($pageEndPos === false) {
            echo "unterminated page at byte $pageStartPos, stopping\r\n";
            break;
        }
        $offset = $pageEndPos + $pageEndTagLength;
        $pagecount++;

        // extract one page = one article
        $page = substr($dump, $pageStartPos, $offset - $pageStartPos);

        // the title of the article, still XML-escaped (safe to embed in HTML as is)
        $title = extractElement($page, 'title');
        if ($title === null) {
            echo "issue with article (no title found in page $pagecount)\r\n";
            continue;
        }

        // Decode XML entities (&amp;, &quot;, ...) before building the filename,
        // so that the decoded characters are sanitised as well.
        $decodedTitle = html_entity_decode($title, ENT_QUOTES | ENT_XML1, 'UTF-8');
        $filename = titleToFilename($decodedTitle);

        if (isInSkippedNamespace($decodedTitle, $skippedNamespaces)) {
            echo "skipping file $filename\r\n";
            $skipped++;
            continue;
        }

        // the actual textual content of the article (empty if the page has none)
        $text = extractElement($page, 'text');
        if ($text === null) {
            $text = '';
        }

        // new content = (original) title + content
        $newContent = "<h1>$title</h1> $text";

        // write page to file
        if (file_put_contents($outputDir . "$filename.html", $newContent) === false) {
            echo "issue with article $title\r\n";
        }
    }

    return ['pagecount' => $pagecount, 'skipped' => $skipped];
}

// ---- Run ---------------------------------------------------------------

try {
    $result = splitWikiDump($rootPath, $xmlDumpFilename, $skippedNamespaces);
    echo "{$result['pagecount']} pages processed, {$result['skipped']} skipped (in unrequired namespaces)";
} catch (RuntimeException $e) {
    echo "error: " . $e->getMessage() . "\r\n";
}
?>
No comments:
Post a Comment