package Lire::WWW::UserAgent::Robot;
#
# $Id: Robot.pm,v 1.3 2006/07/23 13:16:36 vanbaal Exp $
#
# The data in this module is taken from http://www.robotstxt.org/wc/active.html.
# In this message:
#
#  From: Egon Willighagen <egonw@sci.kun.nl>
#  To: m.koster@greenhills.co.uk
#  Subject: Copyright on robots.txt
#  Date: Mon, 24 Sep 2001 09:53:00 +0200
#  Cc: joostvb@logreport.org
#  Message-Id: <E15lQY0-0002Xe-00@garak.sci.kun.nl>
#
# we asked Martijn Koster, the publisher of the data, for the conditions on
# distributing the data. His answer: "Sure".
#
# Further discussion ended with Martijn's statement:
# "I don't want you modifying this data and then naming it in such
#  a way that confuses people into thinking they are looking at my
#  data. So be clear where you modify things."
#
# This is basic "fair use". The reply that it was not our intention
# to change the data, other than removing typos, was not answered.
# (From: Egon Willighagen <egonw@sci.kun.nl>
#  Reply-To: egonw@sci.kun.nl
#  To: "Martijn Koster" <m.koster@greenhills.co.uk>
#  Subject: Re: Copyright on robots.txt
#  Date: Mon, 24 Sep 2001 22:18:12 +0200
#  References: <E15lQY0-0002Xe-00@garak.sci.kun.nl> <E15lW7f-0002ei-00@garak.sci.kun.nl> <052d01c1451b$0343e7e0$b6c2a8c0@home.greenhills.co.uk>
#  In-Reply-To: <052d01c1451b$0343e7e0$b6c2a8c0@home.greenhills.co.uk>
# )
#
# Copyright (C) Martijn Koster
# Copyright (C) 2001, 2002, 2003, 2004 Stichting LogReport Foundation <logreport@logreport.org>
#
#     This file is part of Lire.
#
#     Lire is free software; you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation; either version 2 of the License, or
#     (at your option) any later version.
#
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with this program (see COPYING); if not, check with
#     http://www.gnu.org/copyleft/gpl.html.
#
use strict;
use vars qw/ $REVISION/;

# RCS-style $Id$ keyword string identifying this revision of the file,
# stored in the package variable declared with "use vars" above.
$REVISION = '$Id: Robot.pm,v 1.3 2006/07/23 13:16:36 vanbaal Exp $';

# -----------------------------------------------------------------------------
# Database
# -----------------------------------------------------------------------------

# Robots/crawlers/spiders whose user agent string starts with
# "Mozilla/x.0_(compatible".  Each key is a lowercase fragment that is
# searched for inside the lowercased user agent; each value is the
# display name reported for that robot.
my %MozCompatRobotHash = (
    "yahoo!_slurp" => "Yahoo! Slurp",
    "grub_client"  => "Grub.org",
);

# Lookup table used by getRobot(): each key is the lowercase token
# extracted from the start of a user agent string, each value is the
# display name of the corresponding robot.
#
# Every key must appear only once.  Perl silently keeps the *last* value
# when a key is repeated in a hash list, so an earlier "diibot" => "DIIbot"
# entry that used to live in the first group was dead data and has been
# dropped; the effective mapping ("Digital Integrity Robot") is unchanged.
#
# Some display names carry leading or trailing blanks; they are reproduced
# unchanged.
my %RobotHash = (
    "msnbot" => "MSNBot",
    "dumbot" => "Dumbot",
    "vagabondo" => "Vagabondo",

    "fast-webcrawler" => "FAST-WebCrawler",
    "aranha" => "Arahna Robot",
    "baiduspider" => "BaiDuSpider",
    "fastcrawler" => "FastCrawler",
    "fast" => "FastCrawler",
    "gaislab" => "GaisLab data gatherer",
    "ingrid" => "Ilse Robot",
    "imagecollector" => "ImageCollector",
    "infonavirobot" => "InfoNaviRobot",
    "explorer" => "Explorer",
    "jack" => "Domanova Search",
    "jennybot" => "JennyBot",
    "mercator" => "Mercator",
    "mercator-scrub" => "Mercator",
    "ezresult" => "EZResult -- Internet Search Engine",
    "perman" => " PerMan Surfer",
    "ask" => "Jeeves",
    "links2go" => "Links2Go Similarity Engine",
    "flipdog" => "FlipDog Job Search Engine",
    "msiecrawler" => "MSIECrawler",
    "zyborg" => "WiseNutBot",
    "httrack" => "HTTrack",
    "mumpitscrawler" => "MumpitsCrawler",
    "myfinances" => "parallelContextFocusCrawler",
    "ng" => "NG",
    "nokia-waptoolkit" => "Nokia-WAPToolkit",
    "pcbeaconbot" => "PCBeaconBot",
    "slysearch" => "SlySearch",
    "villspider" => "VillSpider",
    "wfarc" => "WFARC",
    "webcraft" => "WebCraft",
    "webfountain" => "WebFountain",
    "bumblebee" => "bumblebee",
    "flunky" => "flunky",
    "ia" => "ia_archiver",
    "nabot" => "nabot",
    "ndobot" => "ndoBot",
    "parallelcontextfocuscrawler" => "parallelContextFocusCrawler",
    "polybot" => "polybot",
    "psbot" => "psbot",
    "ru-robot" => "ru-robot",
    "speedfind" => "speedfind",
    "tivraspider" => "tivraSpider",
    "vspider" => "vspider",

    "scooter-w" => "Scooter",
    "scooter-venus" => "Scooter",
    "compete" => "larbin",

    "acme.spider" => "Acme.Spider",
    "ahoythehomepagefinder" => "Ahoy! The Homepage Finder",
    "alkaline" => "Alkaline",
    "alkalinebot" => "AlkalineBOT",
    "appie" => "Walhello appie",
    "arachnophilia" => "Arachnophilia",
    "architext" => "ArchitextSpider",
    "aretha" => "Aretha",
    "ariadne" => "ARIADNE",
    "arks" => "arks",
    "aspider" => "ASpider (Associative Spider)",
    "atn.txt" => "ATN Worldwide",
    "atomz" => "Atomz.com Search Robot",
    "auresys" => "AURESYS",
    "backrub" => "BackRub",
    "bayspider" => "bayspider",
    "bigbrother" => "Big Brother",
    "bjaaland" => "Bjaaland",
    "blackwidow" => "BlackWidow",
    "blindekuh" => "Die Blinde Kuh",
    "bloodhound" => "Bloodhound",
    "brightnet" => "bright.net caching robot",
    "bspider" => "BSpider",
    "cactvschemistryspider" => "CACTVS Chemistry Spider",
    "calif" => "Calif",
    "cassandra" => "Cassandra",
    "cgireader" => "Digimarc Marcspider/CGI",
    "checkbot" => "Checkbot",
    "churl" => "churl",
    "cmc" => "CMC/0.01",
    "collective" => "Collective",
    "combine" => "Combine System",
    "conceptbot" => "Conceptbot",
    "coolbot" => "CoolBot",
    "core" => "Web Core / Roots",
    "cosmos" => "XYLEME Robot",
    "cruiser" => "Internet Cruiser Robot",
    "cusco" => "Cusco",
    "cyberspyder" => "CyberSpyder Link Test",
    "deweb" => "DeWeb(c) Katalog/Index",
    "dienstspider" => "DienstSpider",
    "digger" => "Digger",
    "diibot" => "Digital Integrity Robot",
    "directhit" => "Direct Hit Grabber",
    "dnabot" => "DNAbot",
    "download_express" => "DownLoad Express",
    "dragonbot" => "DragonBot",
    "dwcp" => "DWCP (Dridus' Web Cataloging Project)",
    "e-collector" => "e-collector",
    "ebiness" => "EbiNess",
    "eit" => "EIT Link Verifier Robot",
    "elfinbot" => "ELFINBOT",
    "emacs" => "Emacs-w3 Search Engine",
    "emcspider" => "ananzi",
    "esther" => "Esther",
    "evliyacelebi" => "Evliya Celebi",
    "nzexplorer" => "nzexplorer",
    "fdse" => "Fluid Dynamics Search Engine robot",
    "felix" => "Felix IDE",
    "ferret" => "Wild Ferret Web Hopper #1, #2, #3",
    "fetchrover" => "FetchRover",
    "fido" => "fido",
    "finnish" => "Hmhkki",
    "fireball" => "KIT-Fireball",
    "fish" => "Fish search",
    "fouineur" => "Fouineur",
    "francoroute" => "Robot Francoroute",
    "freecrawl" => "Freecrawl",
    "funnelweb" => "FunnelWeb",
    "gama" => "gammaSpider, FocusedCrawler",
    "gazz" => "gazz",
    "gcreep" => "GCreep",
    "getbot" => "GetBot",
    "geturl" => "GetURL",
    "golem" => "Golem",
    "googlebot" => "Googlebot",
    "grapnel" => "Grapnel/0.01 Experiment",
    "griffon" => "Griffon",
    "gromit" => "Gromit",
    "gulliver" => "Northern Light Gulliver",
    "hambot" => "HamBot",
    "harvest" => "Harvest",
    "havindex" => "havIndex",
    "hi" => "HI (HTML Index) Search",
    "hometown" => "Hometown Spider Pro",
    "wired-digital" => "Wired Digital",
    "htdig" => "ht:\/\/Dig",
    "htmlgobble" => "HTMLgobble",
    "hyperdecontextualizer" => "Hyper-Decontextualizer",
    "iajabot" => "iajaBot",
    "ibm" => "IBM_Planetwide",
    "iconoclast" => "Popular Iconoclast",
    "ilse" => "Ingrid",
    "imagelock" => "Imagelock ",
    "incywincy" => "IncyWincy",
    "informant" => "Informant",
    "infoseek" => "InfoSeek Robot 1.0",
    "infoseeksidewinder" => "Infoseek Sidewinder",
    "infospider" => "InfoSpiders",
    "inspectorwww" => "Inspector Web",
    "intelliagent" => "IntelliAgent",
    "irobot" => "I, Robot",
    "iron33" => "Iron33",
    "israelisearch" => "Israeli-search",
    "javabee" => "JavaBee",
    "jbot" => "JBot Java Web Robot",
    "jcrawler" => "JCrawler",
    "jeeves" => "Jeeves",
    "jobo" => "JoBo Java Web Robot",
    "jobot" => "Jobot",
    "joebot" => "JoeBot",
    "jubii" => "The Jubii Indexing Robot",
    "jumpstation" => "JumpStation",
    "katipo" => "Katipo",
    "kdd" => "KDD-Explorer",
    "kilroy" => "Kilroy",
    "ko_yappo_robot" => "KO_Yappo_Robot",
    "labelgrabber.txt" => "LabelGrabber",
    "larbin" => "larbin",
    "legs" => "legs",
    "linkidator" => "Link Validator",
    "linkscan" => "LinkScan",
    "linkwalker" => "LinkWalker",
    "lockon" => "Lockon",
    "logo_gif" => "logo.gif Crawler",
    "lycos" => "Lycos",
    "macworm" => "Mac WWWWorm",
    "magpie" => "Magpie",
    "marvin" => "marvin\/infoseek",
    "mattie" => "Mattie",
    "mediafox" => "MediaFox",
    "merzscope" => "MerzScope",
    "meshexplorer" => "NEC-MeshExplorer",
    "mindcrawler" => "MindCrawler",
    "moget" => "moget",
    "momspider" => "MOMspider",
    "monster" => "Monster",
    "motor" => "Motor",
    "muscatferret" => "Muscat Ferret",
    "mwdsearch" => "Mwd.Search",
    "myweb" => "Internet Shinchakubin",
    "netcarta" => "NetCarta WebMap Engine",
    "netmechanic" => "NetMechanic",
    "netscoop" => "NetScoop",
    "newscan-online" => "newscan-online",
    "nhse" => "NHSE Web Forager",
    "nomad" => "Nomad",
    "northstar" => "The NorthStar Robot",
    "occam" => "Occam",
    "octopus" => "HKU WWW Octopus",
    "openfind" => "Openfind data gatherer",
    "orb_search" => "Orb Search",
    "packrat" => "Pack Rat",
    "pageboy" => "PageBoy",
    "parasite" => "ParaSite",
    "patric" => "Patric",
    "pegasus" => "pegasus",
    "perignator" => "The Peregrinator",
    "perlcrawler" => "PerlCrawler 1.0",
    "phantom" => "Phantom",
    "piltdownman" => "PiltdownMan",
    "pimptrain" => "Pimptrain.com's robot",
    "pioneer" => "Pioneer",
    "pitkow" => "html_analyzer",
    "pjspider" => "Portal Juice Spider",
    "pka" => "PGP Key Agent",
    "plumtreewebaccessor" => "PlumtreeWebAccessor ",
    "poppi" => "Poppi",
    "portalb" => "PortalB Spider",
    "puu" => "GetterroboPlus Puu",
    "python" => "The Python Robot",
    "raven" => "Raven Search",
    "rbse" => "RBSE Spider",
    "resumerobot" => "Resume Robot",
    "rhcs" => "RoadHouse Crawling System",
    "roadrunner" => "Road Runner: The ImageScape Robot",
    "robbie" => "Robbie the Robot",
    "robi" => "ComputingSite Robi/1.0",
    "robofox" => "RoboFox",
    "robozilla" => "Robozilla",
    "roverbot" => "Roverbot",
    "rules" => "RuLeS",
    "safetynetrobot" => "SafetyNet Robot",
    "scooter" => "Scooter",
    "search_au" => "Search.Aus-AU.COM",
    "searchprocess" => "SearchProcess",
    "senrigan" => "Senrigan",
    "sgscout" => "SG-Scout",
    "shaggy" => "ShagSeeker",
    "shaihulud" => "Shai'Hulud",
    "sift" => "Sift",
    "simbot" => "Simmany Robot Ver1.0",
    "site-valet" => "Site Valet",
    "sitegrabber" => "Open Text Index Robot",
    "sitetech" => "SiteTech-Rover",
    "slcrawler" => "SLCrawler",
    "slurp" => "Inktomi Slurp",
    "smartspider" => "Smart Spider",
    "snooper" => "Snooper",
    "solbot" => "Solbot",
    "spanner" => "Spanner",
    "speedy" => "Speedy Spider",
    "spider_monkey" => "spider_monkey",
    "spiderbot" => "SpiderBot",
    "spiderline" => "Spiderline Crawler",
    "spiderman" => "SpiderMan",
    "spiderview" => "SpiderView(tm)",
    "spry" => "Spry Wizard Robot",
    "ssearcher" => "Site Searcher",
    "suke" => "Suke",
    "suntek" => "suntek search engine",
    "sven" => "Sven",
    "tach_bw" => "TACH Black Widow",
    "tarantula" => "Tarantula",
    "tarspider" => "tarspider",
    "tcl" => "Tcl W3 Robot",
    "techbot" => "TechBOT",
    "templeton" => "Templeton",
    "teoma" => "TeomaTechnologies",
    "teomaagent" => "TeomaTechnologies",
    "titin" => "TitIn",
    "titan" => "TITAN",
    "tkwww" => "The TkWWW Robot",
    "tlspider" => "TLSpider",
    "ucsd" => "UCSD Crawl",
    "udmsearch" => "UdmSearch",
    "urlck" => "URL Check",
    "us" => "URL Spider Pro",
    "valkyrie" => "Valkyrie",
    "victoria" => "Victoria",
    "visionsearch" => "vision-search",
    "voyager" => "Voyager",
    "vwbot" => "VWbot",
    "w3index" => "The NWI Robot",
    "w3m2" => "W3M2",
    "wallpaper" => "WallPaper",
    "wanderer" => "the World Wide Web Wanderer",
    "wapspider" => "w\@pSpider by wap4.com",
    "webbandit" => "WebBandit Web Spider",
    "webcatcher" => "WebCatcher",
    "webcopy" => "WebCopy",
    "webfetcher" => "webfetcher",
    "webfoot" => "The Webfoot Robot",
    "weblayers" => "weblayers",
    "weblinker" => "WebLinker",
    "webmirror" => "WebMirror",
    "webmoose" => "The Web Moose",
    "webquest" => "WebQuest",
    "webreader" => "Digimarc MarcSpider",
    "webreaper" => "WebReaper",
    "webs" => "webs",
    "websnarf" => "Websnarf",
    "webspider" => "WebSpider",
    "webvac" => "WebVac",
    "webwalk" => "webwalk",
    "webwalker" => "WebWalker",
    "webwatch" => "WebWatch",
    "whatuseek" => "whatUseek Winona",
    "whowhere" => "WhoWhere Robot",
    "wmir" => "w3mir",
    "wolp" => "WebStolperer",
    "wombat" => "The Web Wombat ",
    "worm" => "The World Wide Web Worm",
    "wwwc" => "WWWC Ver 0.2.5",
    "wz101" => "WebZinger",
    "xget" => "XGET",
    "nederland.zoek" => "Nederland.zoek",
);

# -----------------------------------------------------------------------------
# Functions
# -----------------------------------------------------------------------------

# getRobot( $ua )
#
# Map a User-Agent string to the display name of a known robot.
#
# Parameters:
#   $ua - the user agent string; it is lowercased before matching.
#
# Returns the robot's display name, or undef when the agent is undefined
# or is not recognised.  "return undef" (rather than a bare "return") is
# kept on purpose so that callers using list context keep receiving a
# one-element list, exactly as before.
#
# How it works: a candidate token of the shape "letters, optional dots or
# hyphens, letters" is extracted from the agent and looked up in
# %RobotHash.  For agents starting with "mozilla" the token is taken from
# inside the parenthesised part instead.  When no token can be extracted,
# the agent is searched for the fragments listed in %MozCompatRobotHash.
sub getRobot {
    my ( $ua ) = @_;

    # Nothing to identify; also avoids calling lc() on undef.
    return undef unless defined $ua;
    $ua = lc $ua;

    # Leading token of the agent, e.g. "googlebot" or "fast-webcrawler".
    # Every part of this pattern is optional, so $m is always defined here.
    my ($m) = $ua =~ /^([a-z]*[\.-]*[a-z]*).*/;

    # "mozilla/x.y (word; token ...": take the token after the first
    # semicolon inside the parentheses.  A failed match leaves $m undefined.
    ($m) = $ua =~ /^mozilla.*\(\w*;\s*([a-z]*[\.-]*[a-z]*).*/
        if $m =~ /mozilla/;

    # Still empty: retry with progressively different Mozilla layouts.
    ($m) = $ua =~ /^mozilla.*\(*([a-z]*[\.-]*[a-z]*).*/
        if defined($m) and length($m) == 0;
    ($m) = $ua =~ /^mozilla.*\s+comp\w+\s+([a-z]*[\.-]*[a-z]*).*/
        if defined($m) and length($m) == 0;

    if ( not defined $m ) {
        # ok, but there *are* some crawlers/bots/spiders that
        # *do* use the 'Mozilla/X.0_(compatible' syntax.
        # Keys are visited in sorted order so that the result does not
        # depend on Perl's hash ordering if several fragments match.
        foreach my $robot ( sort keys %MozCompatRobotHash ) {
            return $MozCompatRobotHash{$robot}
                if index( $ua, $robot ) >= 0;
        }
        return undef;
    }

    # Agents presenting themselves as MSIE: the robot name, if any, is
    # somewhere else in the string.  $ua is already lowercase, so no /i.
    if ( $m =~ /msi/ ) {
        return "MS Search"   if $ua =~ /ms\s+search/;
        return "Site Server" if $ua =~ /site\s+server/;
        # The agent spells itself "MSIECrawler"; the previous pattern
        # "msicrawler" could never match it.  The optional "e" keeps the
        # old spelling matching as well.
        return "MSIECrawler" if $ua =~ /msie?crawler/;
        return "TrueRobot"   if $ua =~ /truerobot/;
    }

    # Token looks like a URL or host name: identify by known substrings.
    if ( $m =~ /http/ or $m =~ /www/ ) {
        return "Eidetica web-crawler" if $ua =~ /eidetica/;
        return "IBM Web Crawler"      if $ua =~ /ibm/;
    }

    return $RobotHash{$m} if exists $RobotHash{$m};

    return undef;
}

1;

