#!/usr/bin/perl
#
# pixpirate v0.25  (c) ajax@mobis.com, October 1998
#
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
#
#############
#
# Go to a source URL, collect the sub-URLs listed on that index page,
# then visit each of those sub-URLs and download every jpg file
# that it links to.
# 
# Ultimately this script would be run from crontab every 24 hours,
# looking for new URLs.  It compares each sub-URL with an index of
# already retrieved sub-URLs, and each image has its md5 signature saved and
# compared with an index of other md5 signatures, keeping redundancy minimal.
# Duplicate filenames are renamed with a random number appended to the name.
# 
# todo: replace md5 logging with pack() and unpack() to reduce size of md5 log
#
# Here are some sample web sites you can use to plug into $site_url
#
# http://www.purextc.com/amateur/amat.htm
# http://www.purextc.com/gay/lesbian.htm
# http://www.blastsite.com/main.shtml
# http://www.adultbuffet.com/tgp/tgp.htm
# http://www.purextc.com/
# http://www.ebonyporn.com/main.html
# http://www.pornno.com/habit/gallerypost.html
# http://www.youngnympho.com/main.html
# http://www.book-mark.net/mark.html
# http://ww3.voyeurweb.com/main/Picturep.html

# CONFIGURATION OPTIONS (change these)
#
# These are deliberately left as package globals (no "my"): the subroutines
# further down and the main loop share state through them.
$site_url = "http://www.purextc.com/";            # index page used when no URL is given
$dwnld_dir = "/home/ajax/public_html/pics";       # a dated subdirectory is created in here
$dupindexfile = "/home/ajax/bin/pixpirate.done";  # log of sub-URLs already processed
$debug = 0;    # debug mode, 1=on, 0=off

# MD5 CONFIG OPTIONS (change these)
$md5_checking = 1;     # 0=off, 1=on
$md5sum = "/usr/bin/md5sum";   # location of md5sum binary
$md5_checksum_list = "/home/ajax/bin/pixpirate.md5";  # one fingerprint per line

# Mask out group/other write permission on everything we create
# (directories end up 755, downloaded files 644).
umask 022;
$version = "v0.25";
$SIG{ALRM} = \&timed_out;       # default handler for alarm() timeouts
$timeout_seconds = 30;          # time allowed for a single image download

# Date stamp (two-digit year, month, day) naming today's download directory.
# Computed in-process instead of running date(1): no child process, and no
# blind chop() that would eat a digit if the command output had no newline.
$yymmdd = do {
    my ($day, $month, $year) = (localtime)[3, 4, 5];
    sprintf("%02d%02d%02d", $year % 100, $month + 1, $day);
};
srand;                          # initialize rand function
$jpg = ".jpg";                  # extension appended to renamed duplicates

#############
## M A I N ##
#############

display_banner();                               # show startup banner

# A URL given on the command line overrides the configured default.  When no
# argument is given the default must be left alone (it used to be overwritten
# with an undefined $ARGV[0]).
if (@ARGV) {
    $site_url = $ARGV[0];
    print "Using (param) URL: $site_url\n";
} else {
    print "Using URL: $site_url\n";
}
#&get_options;
setup_working_dir();                            # setup working directory

print "Loading URL: $site_url\n";

# Fetch the index page and keep only the lines that mention both "http" and
# "href".  lynx is started directly (list-form pipe open) rather than through
# a shell, so characters such as '&' or '?' in the URL need no escaping.
@new_parse_sites = ();
if (open(my $index_page, '-|', 'lynx', '-source', $site_url)) {
    @new_parse_sites = grep { /http/i && /href/i } <$index_page>;
    close($index_page);
} else {
    warn "Cannot run lynx for $site_url: $!\n";
}
# Alternative for working from a saved copy of the index page:
#@new_parse_sites = `cat www.purextc.com |egrep -i http|egrep -i href |egrep '\<li\>'`;

##############################################################################
##
## FIRST LOOP: Parse each $full_http_address line and pull out the http
##             address only.  Then download each http web link into an
##             array @jpg_index. 
##
##############################################################################

# Sub-URLs handled by earlier runs; each one is also recorded here as soon as
# this run has finished with it.
my $seen_urls = _load_seen_urls($dupindexfile);

foreach my $index_line (@new_parse_sites) {

    # Pull the absolute link out of the index line; skip lines without one.
    my $full_http_address = _extract_page_url($index_line);
    next unless defined $full_http_address;
    if ($debug) { print "DEBUG: (210) \$full_http_address = $full_http_address\n"; }

    # Check for duplicate http address line
    if ($seen_urls->{$full_http_address}) {
        print "   Already Downloaded: $full_http_address\n";
        next;
    }

    print "Loading Sub-Url: $full_http_address\n";
    my @jpg_index = grep { /jpg/i && /href/i } _lynx_lines($full_http_address);

    # Image links are bare file names, so they live next to the page itself.
    my $http_stem = _image_base_url($full_http_address);
    if ($debug) { print "DEBUG: (230) \$http_stem = $http_stem\n"; }

    ##########################################################################
    ##
    ## SECOND LOOP: Walk the lines of the sub-URL page that mention a jpeg,
    ##              pick out links of the form href="name.jpg" and download
    ##              each of them into today's directory.
    ##
    ##########################################################################
    foreach my $jpeg_line (@jpg_index) {

        # Only plain file names are accepted (word characters and '-').
        # Testing the match result keeps a stale $1 from an earlier match
        # from being mistaken for an image name.
        next unless $jpeg_line =~ /href=\"([\w-]+\.jpg)\"/i;
        my $jpeg_url = $1;
        if ($debug) { print "DEBUG: (110) \$jpeg_url = $jpeg_url\n"; }

        # $jpgfile, $randnum, $using_rand and $working_filename stay package
        # globals: md5_check() and timed_out() read them.
        $jpgfile    = $jpeg_url;
        $using_rand = 0;        # are we using a random filename now? NO.
        if ($debug) { print "DEBUG: (240) \$jpgfile = $jpgfile\n"; }

        my $image_url = "$http_stem/$jpeg_url";
        print "   Downloading: $image_url\n";

        if (-e "$dwnld_dir/$yymmdd/$jpgfile") {
            # Name clash: save as <name><random number>.jpg instead.
            $randnum = int(rand 1000000);
            $jpgfile =~ s/\.jpg//i;
            $using_rand = 1;    # We are using a random filename now.
            $working_filename = "$dwnld_dir/$yymmdd/$jpgfile$randnum$jpg";
            print "   File Exists, Saving as: $working_filename\n";
        } else {
            $working_filename = "$dwnld_dir/$yymmdd/$jpgfile";
        }

        _lynx_download($image_url, $working_filename);

        # Anything under 5000 bytes is discarded as not being a real picture.
        # The file may already be gone if the download timed out.
        if (-e $working_filename && (-s _ || 0) < 5000) {
            print "   File Size less than 5000 bytes.  $working_filename REMOVED.\n";
            unlink($working_filename)
                or warn "   Cannot remove $working_filename: $!\n";
        }

        if ($md5_checking) { md5_check(); }
    } #end foreach $jpeg_line

    # Remember this sub-URL so later runs (and later lines of this run)
    # skip it.
    if (open(my $index_out, '>>', $dupindexfile)) {
        print {$index_out} "$full_http_address\n";
        close($index_out);
    } else {
        warn "   Cannot append to $dupindexfile: $!\n";
    }
    $seen_urls->{$full_http_address} = 1;

} #end foreach @new_parse_sites


##
## Helpers used by the loops above.  Named subs are compiled before the
## script starts running, so defining them after the loop is fine.
##

# Read the index of processed sub-URLs into a lookup hash (reference).
# A missing or unreadable index simply yields an empty hash.
sub _load_seen_urls {
    my ($index_file) = @_;
    my %seen;
    return \%seen unless defined $index_file && -e $index_file;
    if (open(my $index_in, '<', $index_file)) {
        while (my $entry = <$index_in>) {
            chomp $entry;
            # Earlier versions stored URLs with '&' and '?' backslash-escaped
            # for the shell; undo that so old entries still match.
            $entry =~ s/\\([&?])/$1/g;
            $seen{$entry} = 1 if $entry =~ /\S/;
        }
        close($index_in);
    } else {
        warn "   Cannot read $index_file: $!\n";
    }
    return \%seen;
}

# Return the last double-quoted absolute http(s) link on an HTML line, or
# nothing when the line has none.  If the link carries another full URL
# inside it (redirect-style "...?url=http://target/"), the embedded URL is
# used, as previous versions of this script did.
sub _extract_page_url {
    my ($line) = @_;
    my @links = $line =~ /href=\"(https?:[^\"<>\s]+)\"/gi;
    return unless @links;
    my $url = $links[-1];
    $url =~ s{^.+(https?://.*)$}{$1}i;
    return $url;
}

# Return the URL prefix under which a page's bare-file-name links resolve:
# the page URL without its query string and without its last path component.
sub _image_base_url {
    my ($page_url) = @_;
    my $base = $page_url;
    $base =~ s/[?#].*$//;                       # query/fragment is not part of the path
    return $base if $base =~ m{^https?://[^/]+$}i;   # bare host, nothing to strip
    $base =~ s{/[^/]*$}{};                      # drop the document name or trailing '/'
    return $base;
}

# Run "lynx -source URL" without a shell and return the output lines.
sub _lynx_lines {
    my ($url) = @_;
    my $pipe;
    unless (open($pipe, '-|', 'lynx', '-source', $url)) {
        warn "   Cannot run lynx for $url: $!\n";
        return ();
    }
    my @lines = <$pipe>;
    close($pipe);
    return @lines;
}

# Download URL into PATH with "lynx -source", without a shell.  After
# $timeout_seconds the transfer is abandoned: lynx is killed and timed_out()
# is called to report the timeout and remove the partial file.
sub _lynx_download {
    my ($url, $path) = @_;

    my $lynx;
    my $pid = open($lynx, '-|', 'lynx', '-source', $url);
    unless ($pid) {
        warn "   Cannot run lynx for $url: $!\n";
        return;
    }
    my $image;
    unless (open($image, '>', $path)) {
        warn "   Cannot write $path: $!\n";
        kill('TERM', $pid);
        close($lynx);
        return;
    }
    binmode($lynx);
    binmode($image);

    my $finished = eval {
        # die() from the handler is what actually interrupts the blocked read.
        local $SIG{ALRM} = sub { die "timeout\n" };
        alarm($timeout_seconds);
        my $chunk;
        while (read($lynx, $chunk, 65536)) {
            print {$image} $chunk;
        }
        alarm(0);
        1;
    };
    my $failure = $@;
    alarm(0);
    close($image);

    unless ($finished) {
        kill('TERM', $pid);     # otherwise close() below waits for lynx
        if ($failure eq "timeout\n") {
            timed_out();
        } else {
            warn "   Download of $url failed: $failure";
        }
    }
    close($lynx);
    return;
}



##
## S U B R O U T I N E S 
##


# Print the three-line startup banner (a 71-column box containing the program
# name, $version and the author credit) to the currently selected output
# handle.  The rule and padding are built with the repetition operator so the
# box cannot be broken by line wrapping in the source.
sub display_banner {

   my $rule = "+" . ("-" x 69) . "+\n";

   print $rule;
   print "| pixpirate $version (c) ajax\@mobis.com" . (" " x 21) . "October 1998 |\n";
   print $rule;
}

# Make sure today's download directory, $dwnld_dir/$yymmdd, exists.
# Reports whether the directory is reused or created.  Dies if it cannot be
# created (including when a non-directory already occupies the name), since
# no download can succeed without it.
sub setup_working_dir {
 my $working_dir = "$dwnld_dir/$yymmdd";

 if ( -d $working_dir ) {
   print "Using Directory: $working_dir\n";
   return;
 }

 print "Creating Directory: $working_dir\n";
 # Built-in mkdir: no shell involved, and failure is actually noticed.
 # The final mode is 0777 filtered through the process umask.
 mkdir($working_dir, 0777)
   or die "Cannot create directory $working_dir: $!\n";
 return;
} # end setup_working_dir subroutine


sub md5_check {
#
# COMPARE MD5 CHECKSUM
#
# Fingerprint the image that was just downloaded and compare it with the
# fingerprints recorded in $md5_checksum_list.  A file whose fingerprint is
# already listed is deleted as a duplicate; otherwise the fingerprint is
# appended to the list.
#
# Takes no arguments; reads the globals $using_rand, $jpgfile, $randnum,
# $jpg, $dwnld_dir, $yymmdd, $md5sum and $md5_checksum_list, and resets
# $using_rand to 0.
#
 my $image_path = $using_rand
                ? "$dwnld_dir/$yymmdd/$jpgfile$randnum$jpg"
                : "$dwnld_dir/$yymmdd/$jpgfile";
 $using_rand = 0;  # Reset this value back to Zero

 # Nothing to do if the download was already removed (too small, timed out).
 return unless -e $image_path;

 # Run md5sum directly (no shell) and keep only the 32 hex digits, dropping
 # the file name that md5sum prints after them.
 my $md5_fingerprint;
 alarm(10);
 if (open(my $md5_out, '-|', $md5sum, $image_path)) {
  my $first_line = <$md5_out>;
  close($md5_out);
  if (defined $first_line && $first_line =~ /^\\?([0-9a-fA-F]{32})\b/) {
   $md5_fingerprint = $1;
  }
 } else {
  warn "   Cannot run $md5sum: $!\n";
 }
 alarm(0);

 unless (defined $md5_fingerprint) {
  warn "   No MD5 fingerprint for $image_path; duplicate check skipped.\n";
  return;
 }

 # Substring search, so records written by older versions of this script
 # ("fingerprint  filename") are still recognised.  A list that does not
 # exist yet simply contains no duplicates.
 my $is_duplicate = 0;
 if (open(my $known, '<', $md5_checksum_list)) {
  while (my $entry = <$known>) {
   if (index($entry, $md5_fingerprint) >= 0) {
    $is_duplicate = 1;
    last;
   }
  }
  close($known);
 }

 if ($is_duplicate) {
  print "   Duplicate file: $image_path REMOVED.\n";
  unlink($image_path)
    or warn "   Cannot remove $image_path: $!\n";
 } elsif (open(my $list_out, '>>', $md5_checksum_list)) {
  print {$list_out} "$md5_fingerprint\n";
  close($list_out);
 } else {
  warn "   Cannot append to $md5_checksum_list: $!\n";
 }
 return;

} # end md5_check subroutine



# SIGALRM handler (installed via $SIG{ALRM}); also safe to call directly.
# Reports the timeout and deletes the partially written download named by
# the global $working_filename, if there is one.
sub timed_out {
  # $error is not assigned anywhere in the code visible here, so fall back
  # to naming the file that was being written when the alarm went off.
  my $detail = (defined $error && length $error)   ? $error
             : defined $working_filename           ? $working_filename
             :                                       '';
  print "Operation Timed Out: $detail\n";

  # unlink() instead of a shell "rm": no shell parsing of the path, and
  # nothing is attempted when no file name has been set.
  if (defined $working_filename && length $working_filename
      && -e $working_filename) {
    unlink($working_filename)
      or warn "Cannot remove $working_filename: $!\n";
  }
  return;
}

