#!/usr/bin/perl

#
# Recluse 0.1 - The Evil Webspider - Written by powerpork
#
# Takes a httpd host as param and prints found hosttypes
#
# It's my first ever perl script! jipeeeh! so be gentle
# it's probably riddled with bugs, memory leaks and other
# harsh uglyness i'd rather not mention.
#

use IO::Socket;

our $usage =
         "[ Recluse v0.1 - Written by powerpork ]\n"
       . "\n"
       . "  Usage:\n"
       . "  ./recluse.pl <host name> <document path>\n"
       . "\n"
       . "  Example:\n"
       . "  ./recluse.pl www.somesite.com /documents/index.html\n"
       . "\n";

our $reqhosttype =
               "HEAD / HTTP/1.0\r\n"
             . "Accept: text/html\r\n"
             . "User-Agent: mozilla\r\n"
             . "\r\n";


# used these as registers and are therefore declared globally
# bpc: bottom (of) primaries   counter
# bsc: bottom (of) secondaries counter
# ipc: index  (of) primaries   counter
# isc: index  (of) secondaries counter

our $bpc=0, our $bsc=0, our $ipc=0, our $isc=0;

sub p {
  print("place $_[0]\n");
}

sub tohttpreq {
    return "GET $_[0] HTTP/1.0\r\n\r\n";
}

sub httpreq
{
    my $sock = new IO::Socket::INET(
	 PeerAddr => $_[0],
	 PeerPort => 80,
	 Proto => 'tcp',
    );

  return 0  unless $sock;
  $sock->autoflush(1);
  chomp;

  pipe(r, w);
  w->autoflush(1);
  $pid = fork();

  if(!$pid){
    close r;
    local $SIG{ALRM} = 
      sub {
	print w 0;
	close w;
	waitpid($pid, 0);
	exit(0);
      };

    alarm 5;
    print $sock $_[1];
    @msg = <$sock>;
    alarm 0;
    print w @msg;
    close w;
    waitpid($pid,0);
    exit(0);
  }else{
    die "cannot fork: $!" unless defined $pid;
    close w;
    @msg = <r>;
    close r;
    if(@msg){
      return @msg;
    }else{
      return 0;
    }
  }
}

sub filterservertype
{
    my $entry, $servertype;
    for $entry(@_){
	$entry =~ s/\r\n//;
	if($entry =~ /Server: /){
	    $entry =~ s/Server: //;
	    $servertype = $entry;
	}
    }
    return $servertype;
}

sub extractlinks
{
    my @links = (), $entry = (), $c=0;
    for $entry(@_){
	while($entry =~ /[hH][rR][eE][fF][=]["']/){
       	    ($null, $entry, $null) = split(/[hH][rR][eE][fF][=]["']/, $entry);
            ($entry, $null, $null) = split(/["']/, $entry);
            if(($entry =~ /\.htm[l]$/i || $entry =~ /\/$/) && ($entry !~ / /)){
	      $links[$c] = $entry;
	      $c++;
	    }
        }
    }

    splice(@links, $c);
    return @links;
}

sub splithostdoc
{
    my ($host, $doc);
    ($null, $host, $null) = split(/http:\/\//, $_[1]);
    ($host, $null, $null) = split(/\/+/, $host);
    ($null, $null, $null) = split(/$host/, $_[1]);
    $doc = $_[1];

    return ($host, $doc);
}

sub PlaceEntry
{
  my $server = $_[0];
  my $doc = $_[1];
  if( !(($server eq "0")||($server eq "")) && !(IsDoneServer($server)) ){
	printf("[ $server ]:{ %s }\n", filterservertype(httpreq($server, $reqhosttype)));
	AddPrimEntry($server, $doc);
    }elsif ( !(($server eq "0")||($server eq "")||($server =~ / / )) ){
	AddSecEntry($server, $doc);
    }
}

sub IsDoneServer
{
    for $server2(@prim_server_stack){
	if($server2 eq $_[0]){
	    return 1;
	}
    }
    return 0;
}

sub DelPrimEntry
{
    splice(@prim_doc_stack, $_[0], 1);
    splice(@prim_server_stack, $_[0], 1);
    if($_[0] < $ipc){
      $ipc--;
    }
    $bpc--;
}

sub AddPrimEntry
  {
    $prim_server_stack[$bpc] = $_[0];
    if(!($_[1] eq "")){
      $prim_doc_stack[$bpc] = $_[1];
    }else{
      $prim_doc_stack[$bpc] = "/";
    }
    $bpc++;
  }


# I hate repeating myself but it makes
# the code so much more readable

sub DecapPrim
  {
    splice(@prim_doc_stack, $_[0]);
    splice(@prim_server_stack, $_[0]);
  }

sub DelSecEntry
{
    splice(@sec_doc_stack, $_[0], 1);
    splice(@sec_server_stack, $_[0], 1);
    if($_[0] < $isc){
      $isc--;
    }
    $bsc--;
}

sub AddSecEntry
  {
    $sec_server_stack[$bsc] = $_[0];
    if(!($_[1] eq "")){
      $sec_doc_stack[$bsc] = $_[1];
    }else{
      $sec_doc_stack[$bsc] = "/";
    }
    if(($isc > 50) || ($bsc-$isc > 100)){
      TrimSec();
    }

    $bsc++;

  }

sub DecapSec
  {
    splice(@sec_doc_stack, $_[0]);
    splice(@sec_server_stack, $_[0]);
  }

sub ProcEntry
  {
    #p(9);
    local @rlinks;

    @rlinks = extractlinks(  httpreq( $_[0], tohttpreq($_[1]) )  );

 $c=0;
    for $blub(@rlinks){
      $rlinks[$c] = unipath($_[0], $_[1], $rlinks[$c]);
      ($server, $doc) = splithostdoc($_[0], $rlinks[$c]);
      PlaceEntry($server, $doc);
      $c++;
    }
  }

sub unipath # $server, $sourcedoc, $doc
  {
    if($_[2] =~ /^http:\/\//i){
      return $_[2];
    }elsif(($_[2] !~/^\//) && ($_[1] !~/\/$/)){
      my @list = split("/", $_[1]);
      for $ding(@list){ $last = $ding }
      (my $path, $null, $null) = split($last, $_[1]);
      return $path . $_[2];
    }elsif($_[2] !~/^\//){
      return $_[1] . $_[2];
    }else{
      return "http://" . $_[0] . $_[2];
    }
}

sub TrimSec
  {
    my $boundry=50;

    if(($bpc-$ipc)>$boundry){
      @sec_server_stack = ();
      @sec_doc_stack = ();
      $bsc = 0;
      $isc = 0;
    }else{
      my @swaplist_docs = ();
      my @swaplist_servers = ();

      if(($bsc - $isc) > $boundry){
	for($c=0;$c<50;$c++){
	  $swaplist_servers[$c] = $sec_server_stack[$bsc-$boundry+$c];
	  $swaplist_docs[$c] = $sec_doc_stack[$bsc-$boundry+$c];
	  $isc = 0;
	  $bsc = $boundry;
	}
      }else{
	for($c=0;($c+$isc)<$bsc;$c++){
	  $swaplist_servers[$c] = $sec_server_stack[$isc+$c];
	  $swaplist_docs[$c] = $sec_doc_stack[$isc+$c];
	}
	$isc = 0;
	$bsc = $c+1;
      }

      @sec_server_stack = ();
      @sec_doc_stack = ();
      @sec_server_stack = @swaplist_servers;
      @sec_doc_stack = @swaplist_docs;
    }
}

sub DoPrim
  {

    while($ipc < $bpc){
      ProcEntry($prim_server_stack[$ipc], $prim_doc_stack[$ipc]);
      $ipc++;
    }
  }

sub DoSec
  {
    while($isc < $bsc){
      ProcEntry($sec_server_stack[$isc], $sec_doc_stack[$isc]);
      $isc++;
      DoPrim();
    }
  }

sub StartDo
  {
    DoPrim();
    DoSec();
  }


#-----------------------

if(($ARGV[0] eq "") || ($ARGV[1] eq "")){
  print($usage);
  exit(1);
}

ProcEntry($ARGV[0], unipath($ARGV[0], "http://$ARGV[0]/", $ARGV[1]));
StartDo();


#---------------------





