Jump to content

Manual:Chris G's botclasses/DownloadAllImagesBot.php

From mediawiki.org

This bot uses Chris G's botclasses to download all images from a wiki.

<?php
/* DownloadAllImagesBot.php
 * By Leucosticte, https://www.mediawiki.org/wiki/User:Leucosticte
 * GNU Public License 2.0
 *
 * This bot downloads all images from a wiki.
 */

/* Setup my classes. */
// require_once instead of include: nothing below can work without the bot
// classes, so stop with a fatal error right here if the file is missing.
require_once 'botclasses.php';
$wiki      = new wikipedia;
// Use HTTPS so that the login credentials below are not sent in clear text.
$wiki->url = "https://en.wikipedia.org/w/api.php";

/* All the login stuff. */
$user = 'REMOVED';
$pass = 'REMOVED';
$wiki->login( $user, $pass );

// Directory that the downloaded images are saved into.
$dir = "./downloadfiles";
// Create directory if it doesn't exist
if ( !file_exists( $dir ) ) {
    echo "Creating directory $dir...\n";
    // The second is_dir() check covers the case where something else created
    // the directory between file_exists() and mkdir().
    if ( !mkdir( $dir, 0777, true ) && !is_dir( $dir ) ) {
        die( "Could not create directory $dir\n" );
    }
}
// The path may exist but be a regular file; refuse to continue in that case.
if ( !is_dir( $dir ) ) {
    die( "$dir is not a directory\n" );
}

// Main download loop.
//
// Pages through list=allimages 500 entries at a time and saves every image
// that is not already present in $dir. Expects $wiki (a logged-in botclasses
// object) and $dir (an existing directory) to have been set up above.

// Initialize one cURL session and reuse it for every download.
$ch = curl_init();
if ( $ch === false ) {
    die( "Could not initialize cURL\n" );
}
curl_setopt( $ch, CURLOPT_HEADER, 0 );
// Make curl_exec() fail on HTTP status >= 400 so that an error page is never
// saved to disk as if it were the image.
curl_setopt( $ch, CURLOPT_FAILONERROR, true );

// Continuation parameters returned by the previous API response. They are
// sent back verbatim with the next request so that the listing resumes where
// it left off (see API:Continue). Empty for the first request.
$continueParams = [];

// $done = false means that there still are more images left to come
$done = false;

// Keep going until it's evident that there are no more images
while ( !$done ) {
    // Tell the API: put the list in PHP format; get 500 image titles at a
    // time; get the urls for the images; sort the list in ascending order.
    $query = "?action=query&format=php&list=allimages&ailimit=500&aiprop=url&aidir=ascending";
    if ( $continueParams ) {
        // http_build_query() URL-encodes the values; image titles routinely
        // contain spaces, '&', '+' and other characters that would otherwise
        // corrupt the query string.
        $query .= '&' . http_build_query( $continueParams, '', '&' );
    }

    // Get the result of the API query.
    $ret = $wiki->query( $query );
    if ( !is_array( $ret ) ) {
        echo "API query failed; stopping\n";
        break;
    }

    // Work out where the next request has to pick up. Newer MediaWiki
    // versions return a top-level 'continue' array; older ones return
    // 'query-continue' keyed by module name. If neither is present, this was
    // the last batch.
    if ( isset( $ret['continue'] ) && is_array( $ret['continue'] ) ) {
        $continueParams = $ret['continue'];
    } elseif (
        isset( $ret['query-continue']['allimages'] )
        && is_array( $ret['query-continue']['allimages'] )
    ) {
        $continueParams = $ret['query-continue']['allimages'];
    } else {
        $done = true;
    }

    // An error response has no 'query' part; treat it as an empty batch.
    $images = isset( $ret['query']['allimages'] ) && is_array( $ret['query']['allimages'] )
        ? $ret['query']['allimages']
        : [];

    // Loop through that batch of image urls and download them all
    foreach ( $images as $element ) {
        if ( !isset( $element['name'], $element['url'] ) ) {
            continue;
        }

        // Save images in the directory. basename() keeps an unexpected name
        // containing path separators from escaping $dir.
        $filename = "$dir/" . basename( $element['name'] );

        // If the file already exists, don't save it again
        if ( file_exists( $filename ) ) {
            continue;
        }

        // Download into a temporary ".part" file and rename it only after a
        // successful transfer. A failed or interrupted download therefore
        // never leaves behind a file that a later run would mistake for a
        // complete image and skip.
        $partial = $filename . '.part';
        $fh = fopen( $partial, 'wb' );
        if ( $fh === false ) {
            echo "Could not open $partial for writing; skipping\n";
            continue;
        }

        // Stream the response body straight into the file instead of
        // buffering the whole image in memory.
        // See https://www.php.net/function.curl-setopt
        curl_setopt( $ch, CURLOPT_URL, $element['url'] );
        curl_setopt( $ch, CURLOPT_FILE, $fh );
        $ok = curl_exec( $ch );
        fclose( $fh );

        if ( $ok === false ) {
            echo "Failed to download {$element['url']}: " . curl_error( $ch ) . "\n";
            unlink( $partial );
        } elseif ( !rename( $partial, $filename ) ) {
            echo "Could not move $partial to $filename\n";
            unlink( $partial );
        }
    }
}