head	1.13;
access;
symbols
	rel-2-4-3a:1.13
	rel-2-4-3:1.13
	rel-2-4-2:1.13
	rel-2-4-1:1.13
	rel-2-4-0-patches:1.13.0.2
	rel-2-4-0:1.13
	rel-2-2-3:1.9
	rel-2-2-2:1.9
	rel-2-2-1-merged:1.9
	rel-2-2-1:1.9
	rel-2-2-patches:1.9.0.2
	rel-2-2a:1.9
	rel-2-2:1.9
	rel2-2rc1:1.8
	start:1.1.1.1
	vendor:1.1.1;
locks; strict;
comment	@# @;


1.13
date	2003.08.30.05.02.55;	author whmoseley;	state Exp;
branches;
next	1.12;

1.12
date	2003.07.30.05.58.19;	author whmoseley;	state Exp;
branches;
next	1.11;

1.11
date	2003.04.14.16.51.08;	author whmoseley;	state Exp;
branches;
next	1.10;

1.10
date	2002.12.06.01.49.42;	author augur;	state Exp;
branches;
next	1.9;

1.9
date	2002.09.09.07.15.19;	author whmoseley;	state Exp;
branches;
next	1.8;

1.8
date	2002.08.15.05.31.02;	author whmoseley;	state Exp;
branches;
next	1.7;

1.7
date	2002.08.15.03.20.45;	author augur;	state Exp;
branches;
next	1.6;

1.6
date	2002.08.14.22.08.48;	author whmoseley;	state Exp;
branches;
next	1.5;

1.5
date	2002.07.13.23.49.37;	author augur;	state Exp;
branches;
next	1.4;

1.4
date	2002.07.13.23.37.37;	author augur;	state Exp;
branches;
next	1.3;

1.3
date	2001.07.16.17.43.42;	author whmoseley;	state Exp;
branches;
next	1.2;

1.2
date	2001.04.02.01.44.14;	author whmoseley;	state Exp;
branches;
next	1.1;

1.1
date	2000.12.03.22.16.46;	author whmoseley;	state Exp;
branches
	1.1.1.1;
next	;

1.1.1.1
date	2000.12.03.22.16.46;	author whmoseley;	state Exp;
branches;
next	;


desc
@@


1.13
log
@I thought I checked this in already.

This are updates to swishspider and spider.pl to use the OO interface
of SWISH::Filter.
@
text
@#!/usr/bin/perl -w
use strict;

# print STDERR "spider $$ [@@ARGV]\n";

#
# SWISH-E http method Spider
# $Id: swishspider,v 1.12 2003/07/30 05:58:19 whmoseley Exp $ 
#

# Should SWISH::Filter be use for filtering?  This can be left 1 all the time, but
# will add a little time to processing since.

# Note: Just because USE_FILTERS is true doesn't mean SWISH::Filter is in @@INC.
# To get the path to use (for "use lib") run swish-filter-test -path.  Or another way:
#
#  PERL5LIB=`swish-filter-test -path` swish-e -S http -i http://localhost/index.html
#
# The @@INC path is not set by default in swishspider because loading the SWISH::Filter
# modules for every URL might be slow.

use constant USE_FILTERS  => 1;  # 1 = yes use SWISH::Filter for filtering, 0 = no. (faster processing if not set)
use constant FILTER_TEXT  => 0;  # set to one to filter text/* content, 0 will save processing time
use constant DEBUG_FILTER => 0;  # set to one to report errors on loading SWISH::Filter module.

use LWP::UserAgent;
use HTTP::Status;
use HTML::Parser 3.00;
use HTML::LinkExtor;

    if (scalar(@@ARGV) != 2) {
        print STDERR "Usage: $0 localpath url\n";
        exit(1);
    }

    my $ua = new LWP::UserAgent;
    $ua->agent( "SwishSpider http://swish-e.org" );


    my $localpath = shift;
    my $url = shift;

    my $request = new HTTP::Request( "GET", $url );
    my $response = $ua->simple_request( $request );

    # Save the HTTP code, the content/type (or a redirection header), and a last modified date, if one.

    open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response: $!" );
    print RESP $response->code() . "\n";



    # If failed to fetch doc then write out the code and location and exit
    
    if( $response->code != RC_OK ) {
        print RESP ($response->header( "location" ) ||'') . "\n";
        exit;
    }


    # Filter the document, if possible.

    my ( $content_ref, $content_type ) = filter_doc( $response );


    # Write out the (perhaps new) content type and the last modified date.

    print RESP "$content_type\n",
               ($response->last_modified || 0), "\n";

    close RESP;



    # Now write the content -- really only need to do this on text/* types since that's all swish processes
    # No, that's not true.  Can use FileFilter inside of swish-e on binary data.

    open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents: $!\n" );

    # Enable binmode if the contents is not text/*
    binmode CONTENTS unless $content_type =~ m[^text/]i;

    print CONTENTS $$content_ref;
    close( CONTENTS );


    # Finally, extract out links

    exit unless $content_type =~ m!text/html!;

    open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links: $!\n" );
    my $p = HTML::LinkExtor->new( \&linkcb, $url );

    # Don't allow links above the base
    $URI::ABS_REMOTE_LEADING_DOTS = 1;

    $p->parse( $$content_ref );
    close( LINKS );

    exit;


sub linkcb {
    my($tag, %links) = @@_;

    return unless $tag eq 'a' && $links{href};

    my $link = $links{href};

    # Remove fragments
    $link =~ s/(.*)#.*/$1/;

    print LINKS "$link\n";
}


# This will optionally attempt to filter the document

sub filter_doc {
    my $response = shift;

    my ( $content, $content_type ) = ( $response->content, $response->header( "content-type" ) );

    my $content_ref = \$content;

    unless ( $content_type ) {
        warn 'URL: ', $response->base, " did not return a content-type\n";
        return ( $content_ref, 'text/plain' );
    }


    return ( $content_ref, $content_type ) unless USE_FILTERS;  # filters enabled?


    # This can avoid loading the filter module if it is known that type text/* will never be filtered.
    
    return ( $content_ref, $content_type )
        if $content_type =~ m!^text/! && !FILTER_TEXT;
    

    eval { require SWISH::Filter };
    if ( $@@ ) {
        warn $@@ if DEBUG_FILTER;
        return ( $content_ref, $content_type );
    }

    my $filter = SWISH::Filter->new;

    my $doc = $filter->convert(
        document => $content_ref,
        name     => $response->base,
        content_type => $content_type,
    );

    return $doc && $doc->was_filtered
        ? ( $doc->fetch_doc, $doc->content_type )
        : ( $content_ref, $content_type );
}


@


1.12
log
@
IPC::Open3 reads data in binmode by default (on 5.6.1 under Windows at least).
Now set's binmode FH, ':crlf'.  It's expected that the output from running
a program will be text.

This might not be the case (e.g. unzip/gunzip) -- will fix when that case comes up.
@
text
@d8 1
a8 1
# $Id: swishspider,v 1.11 2003/04/14 16:51:08 whmoseley Exp $ 
d14 8
d149 1
a149 1
    my $filtered = $filter->filter(
d155 2
a156 2
    return $filtered
        ? ( $filter->fetch_doc, $filter->content_type )
@


1.11
log
@
Try using AC_CPPFLAGS instead of the same flags everywhere.

Move delay for swishspider to config.h
@
text
@d8 1
a8 1
# $Id: swishspider,v 1.10 2002/12/06 01:49:42 augur Exp $ 
d68 1
d71 4
d85 4
a103 4

    # Remove ../  This is important because the abs() function
    # can leave these in and cause never ending loops.
    $link =~ s/\.\.\///g;
@


1.10
log
@Changed the default perl hash bang to point at /usr/bin/perl.
@
text
@d8 1
a8 1
# $Id: swishspider,v 1.9 2002/09/09 07:15:19 whmoseley Exp $ 
d24 1
a24 1
        print STDERR "Usage: SwishSpider localpath url\n";
@


1.9
log
@Updated swishspider that uses new filter system
@
text
@d1 1
a1 1
#!/usr/local/bin/perl -w
d8 1
a8 1
# $Id: swishspider,v 1.8 2002/08/15 05:31:02 whmoseley Exp $ 
@


1.8
log
@I made it so it always prints something -- which might be nothing...

If it's a 3xx response and no Location is returned and running with -v3
you will see something like:

URL 'http://foo.com/multi' returned redirect code 300 without a Location.

That seem ok?
@
text
@d2 1
d8 1
a8 1
# $Id: swishspider,v 1.7 2002/08/15 03:20:45 augur Exp $ 
d11 6
a16 1
use strict;
d23 28
a50 4
if (scalar(@@ARGV) != 2) {
    print STDERR "Usage: SwishSpider localpath url\n";
    exit(1);
}
a51 2
my $ua = new LWP::UserAgent;
$ua->agent( "SwishSpider http://swish-e.org" );
d53 1
a53 2
my $localpath = shift;
my $url = shift;
d55 1
a55 2
my $request = new HTTP::Request( "GET", $url );
my $response = $ua->simple_request( $request );
a56 6
#
# Write out important meta-data.  This includes the HTTP code.  Depending on the
# code, we write out other data.  Redirects have the location printed, everything
# else gets the content-type.
#
open( RESP, ">$localpath.response" ) || die( "Could not open response file $localpath.response" );
d58 1
a58 1
print RESP $response->code() . "\n";
d60 2
a61 2
if( $response->code() == RC_OK ) {
    print RESP $response->header( "content-type" ) . "\n";
d63 1
a63 3
} elsif( $response->is_redirect() ) {
    print RESP ($response->header( "location" ) ||'') . "\n";
}
a64 1
print RESP ($response->last_modified || 0), "\n";
a65 1
close( RESP );
d67 1
a67 7
#
# Write out the actual data assuming the retrieval was succesful.  Also, if
# we have actual data and it's of type text/html, write out all the links it
# refers to
#
if( $response->code() == RC_OK ) {
    my $contents = $response->content();
d69 2
a70 2
    open( CONTENTS, ">$localpath.contents" ) || die( "Could not open contents file $localpath.contents\n" );
    print CONTENTS $contents;
a72 4
    if( $response->header("content-type") =~ "text/html" ) {
	open( LINKS, ">$localpath.links" ) || die( "Could not open links file $localpath.links\n" );
	my $p = HTML::LinkExtor->new( \&linkcb, $url );
	$p->parse( $contents );
d74 10
a83 3
	close( LINKS );
    }
}
a87 12
    if (($tag eq "a") && ($links{"href"})) {
	my $link = $links{"href"};
	#
	# Remove fragments
	#
	$link =~ s/(.*)#.*/$1/;
	
	#
	# Remove ../  This is important because the abs() function
	# can leave these in and cause never ending loops.
	#
	$link =~ s/\.\.\///g;
d89 27
a115 1
	print LINKS "$link\n";
d117 28
d146 1
@


1.7
log
@Bugfix?  300 responses may not include a location header, it seems.
@
text
@d7 1
a7 1
# $Id: swishspider,v 1.6 2002/08/14 22:08:48 whmoseley Exp $ 
d42 3
a44 2
} elsif( $response->is_redirect() && $response->header( "location" ) ) {
    print RESP $response->header( "location" ) . "\n";
@


1.6
log
@Updated swishspider

- when not running win32 it forks and exec the swishspider program
- windows now runs simply "perl" and still uses system call -- probably
  unsafe, but what isn't in windows
- now store last modified date when using -S http

parser.c

- added <!-- noindex --> option to try to move toward some standard

various other things that should be obvious
@
text
@d7 1
a7 1
# $Id: swishspider,v 1.5 2002/07/13 23:49:37 augur Exp $ 
d42 1
a42 1
} elsif( $response->is_redirect() ) {
@


1.5
log
@Added comment section with RCS ID and brief description.
Didn't mean to change /usr/local to /usr in the last rev.
@
text
@d3 2
d7 1
a7 1
# $Id:$ 
d39 1
d45 3
a75 1

a86 3
	# hack for apostrophe -- changes URL, but should work for most clients.
	$link =~ s/'/%27/g;
	
@


1.4
log
@Require HTML::Parser 3.0 or newer.  Older HTTP::Parser modules seem to
hang a very long time (5-10 minutes) using excessive CPU time.

References:
  http://swish-e.org/archive/4012.html
  http://swish-e.org/archive/4017.html
  http://swish-e.org/archive/4019.html
@
text
@d1 7
a7 1
#!/usr/bin/perl -w
@


1.3
log
@"fix" swishspider to escape "'"
and/or minor updates to comments in other files
@
text
@d1 1
a1 1
#!/usr/local/bin/perl -w
d6 1
@


1.2
log
@Fixed content-type check
@
text
@d1 2
a2 1
#!/usr/local/bin/perl
a4 2
use LWP::RobotUA;
use HTTP::Request;
d14 1
a14 2
$ua->agent( "SwishSpider" );
$ua->from( "ron\@@ckm.ucsf.edu" );
d28 1
d51 1
a51 1
	$p = HTML::LinkExtor->new( \&linkcb, $url );
d74 3
@


1.1
log
@Initial revision
@
text
@d50 1
a50 1
    if( $response->header("content-type") eq "text/html" ) {
@


1.1.1.1
log
@Inital import of 2.1.10-dev
@
text
@@
