#!/usr/bin/perl
# Copyright (c) 2007-2008 www.jeganl.com. All rights reserved.
# All source code and material located at the Internet address of
# http://www.jeganl.com is protected under copyright laws of the United States.
# This source code may not be hosted on any other site without my express, prior,
# written permission. Application to host any of the material elsewhere can be
# made by contacting me.
# I have made every effort and taken great care in making sure that the source
# code and other content included on my web site is technically accurate, but I
# disclaim any and all responsibility for any loss, damage or destruction of
# data or any other property which may arise from relying on it. I will in no
# case be liable for any monetary damages arising from such loss, damage or
# destruction.
#
# What this script does:
#   1. Downloads the listing page, first directly and then through each
#      configured HTTP proxy until the "Top Box Office" heading is found.
#      Progress is logged to prg.out.
#   2. Writes ../htdocs/hw6/moviedetails.xml, fetching one movie detail page
#      and emitting a <movie> element for it.
#   3. Prints a Location header that redirects the client to the result page.

use strict;
use warnings;

#print "Content-type: text/html\n\n";

# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------
my $LISTING_URL   = 'http://movies.yahoo.com';
my $LOG_PATH      = 'prg.out';
my $XML_PATH      = '../htdocs/hw6/moviedetails.xml';
my $REDIRECT_URL  = 'http://csci571.usc.edu:17074/hw6/jwebtech_hw6_2.htm';
my $PROXY_TIMEOUT = 5;    # seconds, applied before each proxied request
my $sleep_timer   = 1;    # seconds to pause before fetching a detail page

#@proxyservers=("200.83.4.60:80","200.181.107.103:80","85.214.126.154:80","66.7.195.129:80","159.145.15.101:80","70.87.7.56:80","208.122.34.234:80","219.87.131.104:80","66.46.148.201:80","192.115.104.88:80");
my @PROXY_SERVERS = (
    "200.83.4.60:80",       "80.58.205.61:80",
    "200.65.0.25:80",       "222.124.193.125:8080",
    "202.105.182.12:80",    "122.153.173.178:3128",
    "213.114.118.44:8080",  "85.198.50.2:80",
    "192.146.7.18:80",      "122.153.173.178:3128",
);

# NOTE(review): in the copy of this script that was reviewed, the markup
# inside the page-marker and XML literals is gone: several literals are empty
# strings and the rest hold only whitespace or plain text. They are reproduced
# here exactly as found, in one place, so the loss is visible. index() with an
# empty needle just returns the search offset, so the slices computed from
# these markers cannot be meaningful until the real values are restored from
# the original script. Nothing below has been guessed.
my $SECTION_END_MARKER = "";         # searched after a section heading
my $LIST_OPEN_MARKER   = "\n    ";   # searched inside the section slice
my $LIST_CLOSE_MARKER  = "\n";       # searched after $LIST_OPEN_MARKER
my $MARKER_TAIL_LENGTH = 5;          # added to each slice length; presumably
                                     # the width of a closing marker - confirm

# Markers used on a movie detail page to find the movie name.
# NOTE(review): all three are empty in the reviewed copy. The name slice later
# skips 7 characters, the length of the '<title>' literal used in the fallback
# branch, which hints at what $NAME_OPEN_MARKER was - confirm before restoring.
my $NAME_REGION_OPEN  = "";
my $NAME_REGION_CLOSE = "";
my $NAME_OPEN_MARKER  = "";

# Text printed around the XML payload, in print order.
# NOTE(review): the empty slots presumably held the XML declaration and the
# root/section tags - confirm. The output bytes match the reviewed copy.
my @DOCUMENT_OPEN  = ( "", "\n", "", "\n" );
my @SECTION_OPEN   = ( "", "\n", "", "\n" );
my @SECTION_CLOSE  = ( "\n", "\n", "\n" );
my @DOCUMENT_CLOSE = ( "\n", "\n" );

# One entry per listing section, in output order. The first heading is also
# the probe that decides whether a download of the listing page succeeded.
# NOTE(review): in the reviewed copy only the first section goes on to fetch
# movie details. The other two locate their first entry and then stop, so
# nothing is written between their opening and closing output. That is kept
# as found; flip fetch_details once the intended behaviour is confirmed.
my @SECTIONS = (
    { heading => "\n\nTop Box Office\n\n",    fetch_details => 1 },
    { heading => "\n\nOpening this Week\n\n", fetch_details => 0 },
    { heading => "\n\nComing Soon\n\n",       fetch_details => 0 },
);

my $log_fh;    # progress log; stays undef when the log cannot be opened
my $xml_fh;    # XML output; written by main() and getIndividualMovieDetails()

# ---------------------------------------------------------------------------
# Small utilities
# ---------------------------------------------------------------------------

# Load LWP::Simple at run time, or print the install hint and stop.
# A compile-time "use" would abort before any check could run, so the hint
# could never be shown.
sub _require_lwp {
    return if eval { require LWP::Simple; 1 };
    print "You need to install the Perl LWP module!\n";
    exit;
}

# Return the user agent object that LWP::Simple uses for its requests.
sub _user_agent {
    no warnings 'once';
    return $LWP::Simple::ua;
}

# Fetch $url and return its body, or '' when the request fails.
# LWP::Simple::get returns undef on failure; normalising it here keeps the
# index/substr arithmetic below free of undef.
sub _get {
    my ($url) = @_;
    my $body = LWP::Simple::get($url);
    return defined $body ? $body : '';
}

# Append text to the progress log, if the log is open.
sub _log {
    my @text = @_;
    print {$log_fh} @text if defined $log_fh;
    return;
}

# substr() that yields '' instead of undef for out-of-range requests.
# The offsets in this script come straight from index() and are often -1 or
# past the end, so out-of-range requests are expected, not exceptional.
sub _slice {
    my ( $text, $offset, $length ) = @_;
    no warnings 'substr';
    my $piece = substr $text, $offset, $length;
    return defined $piece ? $piece : '';
}

# Replace the five XML special characters with their predefined entities.
# "&" is handled first so the entities produced by the later substitutions
# are not escaped a second time.
sub _xml_escape {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/'/&apos;/g;
    $text =~ s/"/&quot;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

# ---------------------------------------------------------------------------
# Listing page
# ---------------------------------------------------------------------------

# Download the listing page and return it.
# The direct request is tried first. If the probe heading is not in the
# response, each proxy is tried in order until one response contains it.
# When no attempt succeeds, the last response is returned as it is.
sub _fetch_listing_page {
    my $probe = $SECTIONS[0]{heading};

    #------First Trying to get page source directly
    my $page = _get($LISTING_URL);
    if ( index( $page, $probe ) != -1 ) {
        _log("direct Connection Worked \n");
        return $page;
    }

    #---- Try using Proxy Servers
    for my $count ( 0 .. $#PROXY_SERVERS ) {
        _log("trying proxy:$count $PROXY_SERVERS[$count] \n");
        _user_agent()->timeout($PROXY_TIMEOUT);
        _user_agent()->proxy( 'http', "http://" . $PROXY_SERVERS[$count] );

        $page = _get($LISTING_URL);
        my $match_found = index $page, $probe;
        _log("match_found : $match_found \n");
        if ( $match_found != -1 ) {
            _log("Match Found : $match_found : proxy:$count $PROXY_SERVERS[$count] \n");
            last;
        }
    }
    return $page;
}

# Return the slice of $page in which the first movie link of the section
# introduced by $heading is searched for.
sub _first_listing_chunk {
    my ( $page, $heading ) = @_;

    my $from    = index $page, $heading;
    my $to      = index lc($page), $SECTION_END_MARKER, $from + 1;
    my $section = _slice( $page, $from, $to - $from + $MARKER_TAIL_LENGTH );

    $from = index lc($section), $LIST_OPEN_MARKER;
    $to   = index lc($section), $LIST_CLOSE_MARKER, $from + 1;
    my $list = _slice( $section, $from, $to - $from + $MARKER_TAIL_LENGTH );

    # NOTE(review): the reviewed copy takes the entry slice with the same
    # offsets as the list slice; no separate per-entry bounds are present.
    # Reproduced as found.
    return _slice( $list, $from, $to - $from + $MARKER_TAIL_LENGTH );
}

# Locate the first movie URL of a section and write that movie's details.
sub _write_first_movie {
    my ( $page, $heading ) = @_;

    my $chunk    = _first_listing_chunk( $page, $heading );
    my $url_from = index lc($chunk), "http";
    my $url_to   = index lc($chunk), ">", $url_from + 1;

    # Pause before hitting the detail page. The reviewed copy passed the
    # bareword sleep_timer here, which numifies to 0 and never paused.
    sleep $sleep_timer;

    getIndividualMovieDetails(
        _slice( $chunk, $url_from, $url_to - $url_from - 1 ) );
    return;
}

# ---------------------------------------------------------------------------
# Detail page helpers
# ---------------------------------------------------------------------------

# Return the text of the first <font> element in the table row that starts
# at $label, or undef when $label does not occur in $page.
# The row is taken to run from $label up to the next "</tr>".
sub _font_cell_text {
    my ( $page, $label ) = @_;

    my $from = index $page, $label;
    return if $from == -1;

    my $to  = index lc($page), "</tr>", $from + 1;
    my $row = _slice( $page, $from, $to - $from );

    # First pass: from "<font" up to and including the "<" of "</font>".
    $from = index lc($row), "<font";
    $to   = index lc($row), "</font>", $from + 1;
    my $element = _slice( $row, $from, $to - $from + 1 );

    # Second pass: the slice now holds only the "<" of the closing tag, so
    # this search normally returns -1 and the slice drops that trailing "<".
    $from = index lc($element), "<font";
    $to   = index lc($element), "</font>", $from + 1;
    my $open_tag_and_text = _slice( $element, $from, $to - $from );

    # Everything after the first ">" is the text.
    my $tag_end = index $open_tag_and_text, ">";
    return _slice( $open_tag_and_text, $tag_end + 1,
        length($open_tag_and_text) );
}

# Return the link texts found in the table row that starts at $label, joined
# with commas. Returns '' when $label is absent or the row has no links.
sub _linked_names {
    my ( $page, $label ) = @_;

    my $from = index $page, $label;
    return '' if $from == -1;

    my $to     = index lc($page), "</tr>", $from + 1;
    my $row    = _slice( $page, $from, $to - $from );
    my $lc_row = lc $row;

    my @names;
    my $link_at = index $lc_row, "<a";
    while ( $link_at != -1 ) {
        my $link_end = index $lc_row, "</a>", $link_at + 1;
        my $link     = _slice( $row, $link_at, $link_end - $link_at );
        my $tag_end  = index $link, ">";
        push @names, _slice( $link, $tag_end + 1, length($link) );
        $link_at = index $lc_row, "<a", $link_at + 1;
    }
    return join ",", @names;
}

#------------------------------------SubRoutine Starts getIndividualMovieDetails---------------------------------------------
# Fetch one movie detail page and append a <movie> element to the XML output.
#
# Arguments: the URL of the detail page. All arguments are joined with a
#            space, as before; callers pass exactly one.
# Output:    written to the XML file opened by main().
#
# Genre, running time, release date, MPAA rating, cast, directors and the two
# link URLs are extracted only when a ")" is found in the movie-name region.
# Otherwise the name is taken from the <title> text and every other field is
# written empty.
sub getIndividualMovieDetails {
    my $movieURL = "@_";
    my $page     = _get($movieURL);

    my $movie_name       = "";
    my $genre            = "";
    my $running_time     = "";
    my $release_date     = "";
    my $mpaa_rating      = "";
    my $starring         = "";
    my $directed_by      = "";
    my $full_details_url = "";
    my $reviews_url      = "";
    my $genre_present    = 1;

    #------------------------------------MovieName ---------------------------------------------
    my $from        = index lc($page), $NAME_REGION_OPEN;
    my $to          = index lc($page), $NAME_REGION_CLOSE;
    my $name_region = _slice( $page, $from, $to - $from );

    my $name_from = index lc($name_region), $NAME_OPEN_MARKER;
    my $paren_at  = index $name_region, ")", $name_from + 1;

    if ( $paren_at == -1 ) {
        # No ")" in the name region: use the <title> text that precedes the
        # site suffix.
        # NOTE(review): the line break inside the suffix literal is present
        # in the reviewed copy and may be an artefact - confirm.
        my $title_from = index lc($name_region), "<title>";
        my $suffix_at  = index $name_region, "on Yahoo! \nMovies";
        $movie_name = _slice( $name_region, $title_from + 7,
            $suffix_at - $title_from - 8 );
    }
    else {
        $movie_name = _slice( $name_region, $name_from + 7,
            $paren_at - $name_from + 1 - 7 );

        #------------------------------------GenreName ---------------------------------------------
        my $value = _font_cell_text( $page, "<b>Genres" );
        if ( defined $value ) {
            $genre = $value;
        }
        else {
            $genre_present = 0;
        }

        #------------------------------------RunningTime ---------------------------------------------
        $value        = _font_cell_text( $page, "<b>Running Time" );
        $running_time = $value if defined $value;

        #------------------------------------ReleaseDate ---------------------------------------------
        $value        = _font_cell_text( $page, "<b>Release Date" );
        $release_date = $value if defined $value;

        #------------------------------------MPAARating ---------------------------------------------
        $value       = _font_cell_text( $page, "<b>MPAA Rating" );
        $mpaa_rating = $value if defined $value;

        #------------------------------------Starring ---------------------------------------------
        $starring = _linked_names( $page, "Starring" );

        #------------------------------------DirectedBy ---------------------------------------------
        $directed_by = _linked_names( $page, "Directed by" );

        #------------------------------------FullDetailsUrl ---------------------------------------------
        # The href is searched for starting 300 characters before the label.
        my $label_at = index $page, "See Full Details";
        if ( $label_at != -1 ) {
            my $href_at = index lc($page), "href=", $label_at - 300;
            $full_details_url = _slice( $page, $href_at + 6,
                $label_at - $href_at - 2 - 6 );
        }

        #------------------------------------CriticsReviewsUrl ---------------------------------------------
        $label_at = index $page, "Critics Reviews";
        if ( $label_at != -1 ) {
            my $href_at = index lc($page), "href=", $label_at - 300;
            if ( $href_at < $label_at ) {
                my $tag_end = index $page, ">", $href_at;
                $reviews_url = _slice( $page, $href_at + 6,
                    $tag_end - $href_at - 7 );
            }
        }
    }

    #-------------Writing to XML File---------------------------
    print {$xml_fh} "<movie name=\"" . _xml_escape($movie_name) . "\">", "\n";

    #---- To Split GenreName
    # Genres are comma separated; an entry of the form "A and B" is split once
    # more at its first " and ". Each name gets its own <genres> element.
    if ( $genre_present && $paren_at != -1 ) {
        for my $entry ( split /,/, $genre ) {
            my $and_at = index $entry, " and ";
            my @names =
                $and_at == -1
              ? ($entry)
              : ( substr( $entry, 0, $and_at ), substr( $entry, $and_at + 5 ) );
            for my $name (@names) {
                print {$xml_fh}
                  "<genres>" . stringtrim( _xml_escape($name) ) . "</genres>",
                  "\n";
            }
        }
    }
    else {
        print {$xml_fh} "<genres></genres>", "\n";
    }

    my @fields = (
        [ runningtime => $running_time ],
        [ releasedate => $release_date ],
        [ mpaarating  => $mpaa_rating ],
        [ starring    => $starring ],
        [ directedby  => $directed_by ],
        [ fulldetails => $full_details_url ],
        [ reviews     => $reviews_url ],
    );
    for my $field (@fields) {
        my ( $tag, $value ) = @{$field};
        print {$xml_fh} "<$tag>" . _xml_escape($value) . "</$tag>", "\n";
    }

    print {$xml_fh} "</movie>", "\n";
    return;
}

# Return $string with leading and trailing whitespace removed.
# The former ($) prototype is dropped: every call comes before this
# definition, so Perl never applied it.
sub stringtrim {
    my $string = shift;
    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
}

# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

# Run the whole job: download, write the XML file, redirect.
sub main {
    _require_lwp();

    # The log is only diagnostic, so failing to open it is not fatal.
    if ( open my $fh, '>', $LOG_PATH ) {
        $log_fh = $fh;
    }
    else {
        warn "Cannot write $LOG_PATH: $!\n";
    }

    my $page = _fetch_listing_page();

    open $xml_fh, '>', $XML_PATH
      or die "Cannot write $XML_PATH: $!\n";

    print {$xml_fh} @DOCUMENT_OPEN;
    for my $section (@SECTIONS) {
        print {$xml_fh} @SECTION_OPEN;
        _write_first_movie( $page, $section->{heading} )
          if $section->{fetch_details};
        print {$xml_fh} @SECTION_CLOSE;
    }
    print {$xml_fh} @DOCUMENT_CLOSE;

    # Buffered write errors only surface when the handle is closed.
    close $xml_fh or warn "Error closing $XML_PATH: $!\n";
    if ( defined $log_fh ) {
        close $log_fh or warn "Error closing $LOG_PATH: $!\n";
    }

    #----Navigate to new page
    print "Location: $REDIRECT_URL\n\n";
    exit;
}

# Run only when executed as a script, so the subs above can be loaded on
# their own (for example by a test) without any network or file activity.
main() unless caller;

1;