#!/usr/bin/perl
# Copyright (c) 2007-2008 www.jeganl.com. All rights reserved.
# All source code and material located at the Internet address of
# http://www.jeganl.com is protected under copyright laws of the United States.
# This source code may not be hosted on any other site without my express, prior,
# written permission. Application to host any of the material elsewhere can be
# made by contacting me.
# I have made every effort and taken great care in making sure that the source
# code and other content included on my web site is technically accurate, but I
# disclaim any and all responsibility for any loss, damage or destruction of
# data or any other property which may arise from relying on it. I will in no
# case be liable for any monetary damages arising from such loss, damage or
# destruction.
#print "Content-type: text/html\n\n";
# Load LWP::Simple and import $ua/get, printing a friendly message if the
# module is not installed.
#
# The check lives in a BEGIN block on purpose: a plain "use LWP::Simple"
# is executed while the file is being compiled, so when the module is
# missing perl aborts before any run-time check could print its message.
# Doing the require and the import together at compile time keeps the
# message reachable and still lets later code call "get URL" without
# parentheses.
BEGIN {
    unless (eval { require LWP::Simple; 1 }) {
        # The literal spans two source lines, so it ends with a newline.
        print "You need to install the Perl LWP module!
";
        exit;
    }
    LWP::Simple->import(qw($ua get));
}
# Fetch the Yahoo! Movies front page into $page: first directly, then, if the
# expected marker text is missing, through each proxy in @proxyservers until
# one returns a page containing the marker.  Progress is logged to prg.out.
#
# Variables set here and read later in the file: $page (the page text),
# $page_got (1 when a proxy succeeded) and the PRG_OUT_FILE handle.

# Debug log.  A failure to open it is reported but is not fatal; prints to
# an unopened handle simply do nothing.
open(PRG_OUT_FILE, '>', 'prg.out')
    or warn "Cannot open prg.out for writing: $!\n";
#@proxyservers=("200.83.4.60:80","200.181.107.103:80","85.214.126.154:80","66.7.195.129:80","159.145.15.101:80","70.87.7.56:80","208.122.34.234:80","219.87.131.104:80","66.46.148.201:80","192.115.104.88:80");
@proxyservers= ("200.83.4.60:80","80.58.205.61:80","200.65.0.25:80","222.124.193.125:8080","202.105.182.12:80","122.153.173.178:3128","213.114.118.44:8080","85.198.50.2:80","192.146.7.18:80","122.153.173.178:3128");
$page_got=0;
#------First Trying to get page source directly
$page = get 'http://movies.yahoo.com';
# The search text spans source lines, so it contains literal newlines.
$match_found = index $page, "
Top Box Office
";
if($match_found == -1)
{
    #---- Try using Proxy Servers
    # Short timeout so a dead proxy does not stall the whole run.
    $ua->timeout(5);
    # Bound the loop by the list itself rather than a hard-coded count.
    for ($count = 0; $count < @proxyservers; $count++)
    {
        print PRG_OUT_FILE "trying proxy:$count $proxyservers[$count] \n";
        $proxyserverurl = "http://" . $proxyservers[$count];
        $ua->proxy('http',$proxyserverurl);
        $page = get 'http://movies.yahoo.com';
        #print PRG_OUT_FILE $page;
        $match_found = index $page, "Top Box Office
";
        print PRG_OUT_FILE "match_found : $match_found \n";
        if($match_found != -1)
        {
            print PRG_OUT_FILE "Match Found : $match_found : proxy:$count $proxyservers[$count] \n";
            $page_got=1;
            # Stop at the first working proxy instead of idling through the rest.
            last;
        }
    }
}
else
{
    print PRG_OUT_FILE "direct Connection Worked \n";
    #print "direct Conn";
}
#print PRG_OUT_FILE $page;
# Open the XML output file and write the "Top Box Office" section: narrow
# $page down with index/substr, then hand the first URL found to
# getIndividualMovieDetails, which writes the per-movie output to MYFILE.
#
# NOTE(review): several literals below are empty strings and the print
# statements emit nothing but newlines; this looks like markup missing from
# the file -- confirm against a pristine copy.  An empty search string makes
# index() match immediately at the starting offset.

# Seconds to pause before fetching a movie detail page.
$sleep_timer=1;
# The whole purpose of the script is to produce this file, so fail loudly
# if it cannot be created instead of silently writing nowhere.
open(MYFILE, '>', '../htdocs/hw6/moviedetails.xml')
    or die "Cannot open ../htdocs/hw6/moviedetails.xml for writing: $!\n";
print MYFILE "";
print MYFILE "\n";
print MYFILE "";
print MYFILE "\n";
#------------------------------------------ Top Box Office Starts--------------------------------------------
print MYFILE "";
print MYFILE "\n";
print MYFILE "";
print MYFILE "\n";
# The search text spans source lines, so it ends with a literal newline.
$startLoc = index $page, "Top Box Office
";
$ind = $startLoc;
$endLoc = index lc($page), "", $ind+1;
$TopBoxOfficeMovies = substr $page, $startLoc, $endLoc - $startLoc + 5;
$startLoc = index lc($TopBoxOfficeMovies), "";
$ind = $startLoc;
$endLoc = index lc($TopBoxOfficeMovies), "
", $ind+1;
$TopBoxOfficeMovies = substr $TopBoxOfficeMovies, $startLoc, $endLoc - $startLoc + 5;
#print $TopBoxOfficeMovies;
$ind = index lc($TopBoxOfficeMovies), "", $ind+1;
$TopBoxOfficeIndividualMovies = substr $TopBoxOfficeMovies, $startLoc, $endLoc - $startLoc + 5;
#print $TopBoxOfficeIndividualMovies . "\n";
# The movie link runs from "http" up to the next ">".
$startLoc = index lc($TopBoxOfficeIndividualMovies), "http";
$subind = $startLoc;
$endLoc = index lc($TopBoxOfficeIndividualMovies), ">", $subind +1;
# Use the variable: the bareword "sleep_timer" would numify to 0 and the
# script would never actually pause between requests.
sleep $sleep_timer;
getIndividualMovieDetails(substr $TopBoxOfficeIndividualMovies, $startLoc, $endLoc - $startLoc -1);
$ind = index lc($TopBoxOfficeMovies), "";
print MYFILE "\n";
print MYFILE "";
print MYFILE "\n";
#------------------------------------------ Opening this Week Starts--------------------------------------------
# Writes the "Opening this Week" section to MYFILE and narrows $page down to
# that part of the page with successive index/substr calls.
# All variables here are package globals; $startLoc, $endLoc and $ind are the
# same variables used by the other sections.
# NOTE(review): several search strings and printed strings are empty, which
# looks like markup missing from the file -- confirm against a pristine copy.
# An empty search string makes index() match immediately at the start offset.
print MYFILE "";
print MYFILE "\n";
print MYFILE "";
print MYFILE "\n";
# The search text spans source lines, so it ends with a literal newline.
$startLoc = index $page, "Opening this Week
";
$ind = $startLoc;
$endLoc = index lc($page), "", $ind+1;
$OpeningthisWeekMovies = substr $page, $startLoc, $endLoc - $startLoc + 5;
$startLoc = index lc($OpeningthisWeekMovies), "";
$ind = $startLoc;
$endLoc = index lc($OpeningthisWeekMovies), "
", $ind+1;
$OpeningthisWeekMovies = substr $OpeningthisWeekMovies, $startLoc, $endLoc - $startLoc + 5;
$ind = index lc($OpeningthisWeekMovies), "", $ind+1;
$OpeningthisWeekIndividualMovies = substr $OpeningthisWeekMovies, $startLoc, $endLoc - $startLoc + 5;
# Position of the first "http" in the extracted text.
$startLoc = index lc($OpeningthisWeekIndividualMovies), "http";
$subind = $startLoc;
# NOTE(review): unlike the Top Box Office section, no call to
# getIndividualMovieDetails is visible here, so the offsets computed above
# are not used within this section -- presumably lost; confirm.
$endLoc = index lc($OpeningthisWeekIndividualMovies), "";
print MYFILE "\n";
print MYFILE "";
print MYFILE "\n";
#------------------------------------------ Coming Soon Starts--------------------------------------------
# Writes the "Coming Soon" section to MYFILE and narrows $page down to that
# part of the page with successive index/substr calls.
# All variables here are package globals; $startLoc, $endLoc and $ind are the
# same variables used by the other sections.
# NOTE(review): several search strings and printed strings are empty, which
# looks like markup missing from the file -- confirm against a pristine copy.
# An empty search string makes index() match immediately at the start offset.
print MYFILE "";
print MYFILE "\n";
print MYFILE "";
print MYFILE "\n";
# The search text spans source lines, so it ends with a literal newline.
$startLoc = index $page, "Coming Soon
";
$ind = $startLoc;
$endLoc = index lc($page), "", $ind+1;
$ComingSoonMovies = substr $page, $startLoc, $endLoc - $startLoc + 5;
$startLoc = index lc($ComingSoonMovies), "";
$ind = $startLoc;
$endLoc = index lc($ComingSoonMovies), "
", $ind+1;
$ComingSoonMovies = substr $ComingSoonMovies, $startLoc, $endLoc - $startLoc + 5;
$ind = index lc($ComingSoonMovies), "", $ind+1;
$ComingSoonIndividualMovies = substr $ComingSoonMovies, $startLoc, $endLoc - $startLoc + 5;
# Position of the first "http" in the extracted text.
$startLoc = index lc($ComingSoonIndividualMovies), "http";
$subind = $startLoc;
# NOTE(review): unlike the Top Box Office section, no call to
# getIndividualMovieDetails is visible here, so the offsets computed above
# are not used within this section -- presumably lost; confirm.
$endLoc = index lc($ComingSoonIndividualMovies), "";
print MYFILE "\n";
print MYFILE "";
print MYFILE "\n";
# Finish the output, close both files and redirect the browser to the page
# that displays the generated XML.
print MYFILE "";
print MYFILE "\n";
# Buffered write errors (disk full, etc.) only surface when the handle is
# closed, so report a failed close instead of ignoring it.
close(MYFILE) or warn "Cannot close moviedetails.xml: $!\n";
close(PRG_OUT_FILE) or warn "Cannot close prg.out: $!\n";
#----Navigate to new page
$url = "http://csci571.usc.edu:17074/hw6/jwebtech_hw6_2.htm";
# A Location header followed by a blank line makes up the whole response.
print "Location: $url\n\n";
exit;
#------------------------------------SubRoutine Starts getIndividualMovieDetails---------------------------------------------
# getIndividualMovieDetails(URL)
# Downloads the page at the URL passed in @_ with get() and pulls the movie
# fields out of it with index/substr, storing them in the variables that are
# reset at the top of the sub.
# Apart from $movieDetailPage nothing is declared with "my", so $startLoc,
# $endLoc and every result variable are package globals shared with the code
# outside the sub.
# NOTE(review): many index() calls below search for an empty string and many
# substitutions replace a character with itself; this looks like markup text
# missing from the literals -- confirm against a pristine copy before relying
# on any of this logic.
sub getIndividualMovieDetails
{
# Reset every result field so values from a previous call do not leak through.
$MovieName="";
$GenreName="";
$RunningTime="";
$ReleaseDate="";
$MPAARating="";
$StarringFinalString="";
$DirectedByFinalString="";
# $FullDetailsUrl is only ever set to "" in the visible code.
$FullDetailsUrl="";
$CriticsReviewsUrl="";
$IsSpecialMovie=0;
$IsGenrePresent=0;
# "@_" joins the arguments with the list separator; the visible caller passes
# a single URL.
$movieURL = "@_";
#print $movieURL;
#print "\n";
my $movieDetailPage = get $movieURL;
#print $movieDetailPage;
# As written both searches use an empty string and so return 0, which makes
# the substr below zero-length.
$startLoc = index lc($movieDetailPage), "";
$endLoc = index lc($movieDetailPage), "";
$MovieNameSubstring = substr $movieDetailPage, $startLoc, $endLoc - $startLoc;
$startLoc = index lc($MovieNameSubstring), "";
$subIndivind = $startLoc;
# Look for a closing parenthesis in the name text.
$endLoc = index $MovieNameSubstring, ")", $subIndivind+1;
# No ")" found: fall back to the text that precedes "on Yahoo! Movies".
if ($endLoc == -1 ) {
#print "\n" ;
#print $MovieNameSubstring ;
#print "\n" ;
$startLocTemp = index lc($MovieNameSubstring), "";
$endLocTemp = index $MovieNameSubstring, "on Yahoo! Movies";
$MovieNameSubstring = substr $MovieNameSubstring, $startLocTemp + 7, $endLocTemp - $startLocTemp -8;
$MovieName=$MovieNameSubstring ;
}
# $IsSpecialMovie is -1 when no ")" was found above; the detail fields are
# parsed only when it is not -1.
$IsSpecialMovie=$endLoc;
if ($IsSpecialMovie != -1) {
#------------------------------------MovieName ---------------------------------------------
$MovieName = substr $MovieNameSubstring, $startLoc + 7, $endLoc - $startLoc + 1 - 7;
#------------------------------------MovieName ---------------------------------------------
# Each labelled field below is handled the same way: find the label in the
# page, cut progressively smaller substrings, then keep what follows the
# first ">".
$startLoc = index $movieDetailPage, "Genres";
if ($startLoc != -1) {
$subIndivind = $startLoc;
$endLoc = index lc($movieDetailPage), "", $subIndivind+1;
$GenreWholeString = substr $movieDetailPage, $startLoc, $endLoc - $startLoc;
$startLoc = index lc($GenreWholeString), "", $subIndivind+1;
$GenreSubString1 = substr $GenreWholeString, $startLoc, $endLoc - $startLoc + 1;
$startLoc = index lc($GenreSubString1), "", $subIndivind+1;
$GenreSubString2 = substr $GenreSubString1, $startLoc, $endLoc - $startLoc;
$startLoc = index $GenreSubString2, ">";
$subIndivind = $startLoc;
#------------------------------------GenreName ---------------------------------------------
$GenreName = substr $GenreSubString2, $startLoc +1, length($GenreSubString2) ;
#------------------------------------GenreName ---------------------------------------------
}
else
{
# Flag a missing "Genres" label; it is tested again when the genre output
# is written.
$IsGenrePresent=-1;
}
$startLoc = index $movieDetailPage, "Running Time";
if ($startLoc!= -1) {
$subIndivind = $startLoc;
$endLoc = index lc($movieDetailPage), "", $subIndivind+1;
$RunningTimeWholeString = substr $movieDetailPage, $startLoc, $endLoc - $startLoc;
$startLoc = index lc($RunningTimeWholeString), "", $subIndivind+1;
$RunningTimeSubString1 = substr $RunningTimeWholeString, $startLoc, $endLoc - $startLoc +1;
$startLoc = index lc($RunningTimeSubString1), "", $subIndivind+1;
$RunningTimeSubString2 = substr $RunningTimeSubString1, $startLoc, $endLoc - $startLoc;
$startLoc = index $RunningTimeSubString2, ">";
$subIndivind = $startLoc;
#------------------------------------RunningTime ---------------------------------------------
$RunningTime = substr $RunningTimeSubString2, $startLoc +1, length($RunningTimeSubString2);
}
#------------------------------------RunningTime ---------------------------------------------
$startLoc = index $movieDetailPage, "Release Date";
if ($startLoc != -1) {
$subIndivind = $startLoc;
$endLoc = index lc($movieDetailPage), "", $subIndivind+1;
$ReleaseDateWholeString = substr $movieDetailPage, $startLoc, $endLoc - $startLoc;
$startLoc = index lc($ReleaseDateWholeString), "", $subIndivind+1;
$ReleaseDateSubString1 = substr $ReleaseDateWholeString, $startLoc, $endLoc - $startLoc + 1;
$startLoc = index lc($ReleaseDateSubString1), "", $subIndivind+1;
$ReleaseDateSubString2 = substr $ReleaseDateSubString1, $startLoc, $endLoc - $startLoc;
$startLoc = index $ReleaseDateSubString2, ">";
$subIndivind = $startLoc;
#------------------------------------ReleaseDate ---------------------------------------------
$ReleaseDate = substr $ReleaseDateSubString2, $startLoc +1, length($ReleaseDateSubString2);
#------------------------------------ReleaseDate ---------------------------------------------
}
$startLoc = index $movieDetailPage, "MPAA Rating";
if ($startLoc != -1) {
$subIndivind = $startLoc;
$endLoc = index lc($movieDetailPage), "", $subIndivind+1;
$MPAARatingWholeString = substr $movieDetailPage, $startLoc, $endLoc - $startLoc;
$startLoc = index lc($MPAARatingWholeString), "", $subIndivind+1;
$MPAARatingSubString1 = substr $MPAARatingWholeString, $startLoc, $endLoc - $startLoc + 1;
$startLoc = index lc($MPAARatingSubString1), "", $subIndivind+1;
$MPAARatingSubString2 = substr $MPAARatingSubString1, $startLoc, $endLoc - $startLoc;
$startLoc = index $MPAARatingSubString2, ">";
$subIndivind = $startLoc;
#------------------------------------MPAARating ---------------------------------------------
$MPAARating = substr $MPAARatingSubString2, $startLoc +1, length($MPAARatingSubString2);
}
#------------------------------------MPAARating ---------------------------------------------
$startLoc = index $movieDetailPage, "Starring";
if ($startLoc != -1) {
$subIndivind = $startLoc;
$endLoc = index lc($movieDetailPage), "", $subIndivind+1;
$StarringWholeString = substr $movieDetailPage, $startLoc, $endLoc - $startLoc;
# NOTE(review): $Starringind is read here but is not set earlier in this sub.
$Starringind = index lc($StarringWholeString), "", $Starringind + 1;
$StarringSubString = substr $StarringWholeString, $startLoc, $endLoc - $startLoc ;
$startLoc = index $StarringSubString, ">";
# NOTE(review): $i is never initialised in the visible code; it is only
# incremented below.  Presumably a loop counter whose loop is missing -- confirm.
if ($i !=1) {
$StarringFinalString = $StarringFinalString . "," . substr $StarringSubString, $startLoc +1, length($StarringSubString);
}
else
{
$StarringFinalString = substr $StarringSubString, $startLoc +1, length($StarringSubString);
}
$i=$i+1;
$Starringind = index lc($StarringWholeString), "", $subIndivind+1;
# NOTE(review): this reuses $startLoc/$endLoc left over from the Starring
# step; no search for a "Directed by" label is visible -- confirm.
$DirectedByWholeString = substr $movieDetailPage, $startLoc, $endLoc - $startLoc;
$DirectedByind = index lc($DirectedByWholeString), "", $DirectedByind + 1;
$DirectedBySubString = substr $DirectedByWholeString, $startLoc, $endLoc - $startLoc ;
$startLoc = index $DirectedBySubString, ">";
if ($i !=1) {
$DirectedByFinalString = $DirectedByFinalString . "," . substr $DirectedBySubString, $startLoc +1, length($DirectedBySubString);
}
else
{
$DirectedByFinalString = substr $DirectedBySubString, $startLoc +1, length($DirectedBySubString);
}
$i=$i+1;
$DirectedByind = index lc($DirectedByWholeString), "", $startLoc ;
$CriticsReviewsUrl = substr $movieDetailPage, $startLoc + 6, $endLoc - $startLoc -7;
}
else
{
# As the braces stand, this is the "no Starring label" branch.
$CriticsReviewsUrl = "";
}
}
#------------------------------------CriticsReviewsUrl ---------------------------------------------
#-------------Writing to XML File---------------------------
# NOTE(review): as the braces stand, the next line closes the sub, so
# everything after it runs at file scope and the final "}" of this section
# has no matching opener.  Presumably one or more loop headers are missing
# above -- confirm against a pristine copy.
}
# Character substitutions applied to each field before it is written.
# NOTE(review): as written each one replaces a character with itself, and the
# "s/</g;" lines have no visible replacement part and appear to run into the
# following line -- presumably escaping whose replacement text was lost; confirm.
$MovieName =~ s/&/&/g;
$MovieName =~ s/'/'/g;
$MovieName =~ s/"/"/g;
$MovieName =~ s/</g;
$MovieName =~ s/>/>/g;
print MYFILE "" ;
print MYFILE "\n";
#---- To Split GenreName
#print MYFILE "" . $GenreName . "" ;
# Genre output: only when a "Genres" label was found ($IsGenrePresent) and a
# ")" was found in the name text ($IsSpecialMovie).
if ($IsGenrePresent !=-1) {
if ($IsSpecialMovie != -1)
{
# Genres are comma separated; an entry may itself join two genres with " and ".
@GenreNameArray = split(/,/, $GenreName);
foreach $GenreSplitName (@GenreNameArray)
{
$endLoc = index $GenreSplitName, " and ";
if ($endLoc != -1) {
# Split "A and B" into its two parts and write each one separately.
$startLoc = 0;
$firstPart= substr $GenreSplitName, $startLoc, $endLoc - $startLoc;
$startLoc = index $GenreSplitName, " and ";
$endLoc = length($GenreSplitName);
$SecondPart= substr $GenreSplitName, $startLoc + 5, $endLoc - $startLoc -5;
$firstPart =~ s/&/&/g;
$firstPart =~ s/'/'/g;
$firstPart =~ s/"/"/g;
$firstPart =~ s/</g;
$firstPart =~ s/>/>/g;
$SecondPart =~ s/&/&/g;
$SecondPart =~ s/'/'/g;
$SecondPart =~ s/"/"/g;
$SecondPart =~ s/</g;
$SecondPart =~ s/>/>/g;
print MYFILE "" . stringtrim($firstPart) . "" ;
print MYFILE "\n";
print MYFILE "" . stringtrim($SecondPart) . "" ;
print MYFILE "\n";
}
else
{
$GenreSplitName =~ s/&/&/g;
$GenreSplitName =~ s/'/'/g;
$GenreSplitName =~ s/"/"/g;
$GenreSplitName =~ s/</g;
$GenreSplitName =~ s/>/>/g;
print MYFILE "" . stringtrim($GenreSplitName) . "" ;
print MYFILE "\n";
}
}
}
else
{
print MYFILE "" ;
print MYFILE "\n";
}
}
else
{
print MYFILE "" ;
print MYFILE "\n";
}
#print MYFILE "\n";
# Same per-field substitutions as for $MovieName, applied to the remaining fields.
$RunningTime =~ s/&/&/g;
$RunningTime =~ s/'/'/g;
$RunningTime =~ s/"/"/g;
$RunningTime =~ s/</g;
$RunningTime =~ s/>/>/g;
$ReleaseDate =~ s/&/&/g;
$ReleaseDate =~ s/'/'/g;
$ReleaseDate =~ s/"/"/g;
$ReleaseDate =~ s/</g;
$ReleaseDate =~ s/>/>/g;
$MPAARating =~ s/&/&/g;
$MPAARating =~ s/'/'/g;
$MPAARating =~ s/"/"/g;
$MPAARating =~ s/</g;
$MPAARating =~ s/>/>/g;
$StarringFinalString =~ s/&/&/g;
$StarringFinalString =~ s/'/'/g;
$StarringFinalString =~ s/"/"/g;
$StarringFinalString =~ s/</g;
$StarringFinalString =~ s/>/>/g;
$DirectedByFinalString =~ s/&/&/g;
$DirectedByFinalString =~ s/'/'/g;
$DirectedByFinalString =~ s/"/"/g;
$DirectedByFinalString =~ s/</g;
$DirectedByFinalString =~ s/>/>/g;
$FullDetailsUrl =~ s/&/&/g;
$FullDetailsUrl =~ s/'/'/g;
$FullDetailsUrl =~ s/"/"/g;
$FullDetailsUrl =~ s/</g;
$FullDetailsUrl =~ s/>/>/g;
$CriticsReviewsUrl =~ s/&/&/g;
$CriticsReviewsUrl =~ s/'/'/g;
$CriticsReviewsUrl =~ s/"/"/g;
$CriticsReviewsUrl =~ s/</g;
$CriticsReviewsUrl =~ s/>/>/g;
# Write one line per field to MYFILE.
print MYFILE "" . $RunningTime . "" ;
print MYFILE "\n";
print MYFILE "" . $ReleaseDate . "" ;
print MYFILE "\n";
print MYFILE "" . $MPAARating . "" ;
print MYFILE "\n";
print MYFILE "" . $StarringFinalString . "" ;
print MYFILE "\n";
print MYFILE "" . $DirectedByFinalString . "" ;
print MYFILE "\n";
print MYFILE "" . $FullDetailsUrl . "" ;
print MYFILE "\n";
print MYFILE "" . $CriticsReviewsUrl . "" ;
print MYFILE "\n";
print MYFILE "";
print MYFILE "\n";
#print $MovieName;
#print "\n";
#print $GenreName;
#print "\n";
#print $RunningTime;
#print "\n";
#print $ReleaseDate;
#print "\n";
#print $MPAARating;
#print "\n";
#print $StarringFinalString;
#print "\n";
#print $DirectedByFinalString;
#print "\n";
#print $FullDetailsUrl;
#print "\n";
#print $CriticsReviewsUrl;
#print "\n";
# NOTE(review): this brace has no matching opener in the text as it stands.
}
# stringtrim(STRING)
# Returns a copy of STRING with whitespace removed from both ends; the
# caller's value is left untouched and inner whitespace is preserved.
sub stringtrim($)
{
    my ($text) = @_;
    # Alias the copy to $_ so both ends can be stripped without repeating
    # the variable name.
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }
    return $text;
}