Marvin

Marvin fetches press release pages, compares each one with yesterday's copy of the same page and, if it has changed, extracts the title and link of the latest press release. The title and link are then folded into a template using server-side includes.

#!/bin/sh

# Copy yesterday's files into the "old" directory so they can be compared
# with today's downloads.
# Create the directory first: without it, cp would either fail (several
# pages) or create a plain file called "old" (a single page).
mkdir -p old
# The ./ prefix keeps a file name starting with "-" from being read as an
# option by cp.
cp ./*.html old/

# Get today's copy of each page.
/path/to/curl -o cn.html http://www.cn.ca/cnwebsite/cnwebsite.nsf/public/en_NewsPressReleases
/path/to/curl -o amtk.html http://www.amtrak.com/news/archive/oct99.html

#done. now the shellscript (above) invokes the perlscript (below) 
#which compares the files and extracts the information

#!/usr/bin/perl -w

# Marvin, comparison stage.
#
# Rebuilds template.shtml from scratch, then compares every page in the
# current directory (*.html) with yesterday's copy in old/. For each page
# that looks changed, the per-page shell commands listed in the *.list files
# are run and the page's line from template.list is appended to the template.
#
# Every *.list file holds one entry per line; entry N belongs to the N-th
# page in the order printed by `ls *.html`.

use strict;

my $TEMPLATE_FILE = 'template.shtml';

# A page counts as changed when the number of word characters differs from
# yesterday's copy by MORE than this value.
my $CHANGE_THRESHOLD = 2;

# Cache of list-file contents, keyed by file name, so each list is read at
# most once instead of once per changed page.
my %lines_of;

# Write $text to the template. $mode is '>' to start a fresh file or '>>' to
# append. The handle is closed again immediately so that the file on disk is
# complete before any external command runs.
sub write_template {
    my ($mode, $text) = @_;
    open my $out, $mode, $TEMPLATE_FILE
        or die "Cannot open $TEMPLATE_FILE ($mode): $!\n";
    print $out $text;
    close $out or die "Cannot close $TEMPLATE_FILE: $!\n";
    return;
}

# Return the number of word characters (\w) in the file $path.
# A missing or unreadable file counts as 0, with a warning, so a page with
# no copy from yesterday is simply reported as changed.
sub count_word_chars {
    my ($path) = @_;
    my $fh;
    unless (open $fh, '<', $path) {
        warn "Cannot read $path: $!\n";
        return 0;
    }
    my $text = do { local $/; <$fh> };
    close $fh;
    return 0 unless defined $text;
    my $count = () = $text =~ /\w/g;
    return $count;
}

# Return entry number $no (0-based, trailing newline kept) of the list file
# $list_file, or undef if the file is unreadable or has no such entry.
# The file is read lazily, the first time one of its entries is needed.
sub list_entry {
    my ($list_file, $no) = @_;
    unless ($lines_of{$list_file}) {
        my @lines;
        if (open my $fh, '<', $list_file) {
            @lines = <$fh>;
            close $fh;
        }
        else {
            warn "Cannot read $list_file: $!\n";
        }
        $lines_of{$list_file} = \@lines;
    }
    return $lines_of{$list_file}[$no];
}

# Run the shell command stored as entry $no of $list_file and copy whatever
# it prints to our own standard output. Missing or blank entries are skipped.
sub run_step {
    my ($list_file, $no) = @_;
    my $command = list_entry($list_file, $no);
    unless (defined $command && $command =~ /\S/) {
        warn "No command for page $no in $list_file\n";
        return;
    }
    print `$command`;
    return;
}

# Start a fresh template with the header.
write_template('>',
      "<!--\#include virtual=\"header.rail\" -->\n\n"
    . "See also the complete list on a <a href=\"pr.html\">separate page</a>.<br>Last updated: <!--\#echo var=\"LAST_MODIFIED\" --><br>\n\n"
);

# `ls` is kept (rather than glob) so that the page order, and therefore the
# index into the *.list files, stays exactly what it has always been.
my @newlist = `ls *.html`;
chomp @newlist;

# Check every page that is actually present, however many there are.
for my $no (0 .. $#newlist) {
    my $page = $newlist[$no];

    # Compare against yesterday's copy of the SAME file, so the pairing
    # stays correct even when a page is missing from one of the directories.
    my $newlength = count_word_chars($page);
    my $oldlength = count_word_chars("old/$page");

    # Only the word-character counts are compared; a difference of more
    # than $CHANGE_THRESHOLD means the page has probably changed and there
    # is a new press release.
    next unless abs($newlength - $oldlength) > $CHANGE_THRESHOLD;

    run_step('grep.list',   $no);
    run_step('spacer.list', $no);
    run_step('spacer.list', $no);    # the spacer step runs twice, as before
    run_step('link.list',   $no);
    run_step('edit.list',   $no);
    run_step('strip.list',  $no);
    run_step('edit2.list',  $no);

    my $entry = list_entry('template.list', $no);
    $entry = '' unless defined $entry;
    write_template('>>', "$entry\n");
}

# Finish the template with the footer.
write_template('>>',
      "<hr><h2>Recent Accidents</h2>\n"
    . "<p>From <a href=\"http://danger-ahead.railfan.net/\">Danger Ahead</a></p><p>\n"
    . "<!--\#include virtual=\"david\" -->"
    . "</body></html>"
);

Below is the link extractor.

#!/usr/bin/perl

# program written by Staffan, staham@algonet.se
# (the credit was originally given in both Swedish and English)
#
# Link extractor. Usage: htmltest file(s)
#
# Reads each file line by line and, for every line containing `<a href=`
# (case-insensitive), prints the double-quoted href value on a line of its
# own. When a line holds several links, only the LAST one is printed.
# Lines whose href is not enclosed in double quotes print nothing, instead
# of being echoed back whole as if they were a link.

use strict;
use warnings;

if (@ARGV == 0) {
    die "Usage: htmltest file(s)\n ";
}

foreach my $file (@ARGV) {
    # Three-argument open: the file name is never interpreted as a mode or
    # a pipe, whatever characters it contains.
    open(my $in, '<', $file) || die "Open $file - $!\n";

    while (my $ln = <$in>) {
        # Drop line endings (LF and CR) wherever they occur.
        $ln =~ s/[\n\r]//g;

        # The greedy leading .* makes the match land on the last
        # `<a href="..."` of the line.
        if ($ln =~ /.*<a href="(.*?)"/i) {
            print "$1\n";
        }
    }

    close $in;
}

"Is there any further service you would like me to perform for you perhaps?" the super-intelligent robot Marvin asked Arthur and Fenchurch. Marvin had been taken along on countless travels through time and as such had been in existence for an inconceivably long time. "A piece of paper you would like me to pick up for you? Or maybe you would like me," Marvin continued, "to open a door?"