検索結果のURL先のテキストをすべて読み込む(修正版)[Perl]
というわけで、修正版です。
これで、「静岡」で検索した結果に含まれるURL先のテキストをすべて抜き出せます。
スパゲッティ的ではありますが、実現が第一ということで・・・。
#!/usr/bin/perl
use strict;
use warnings;
use LWP::UserAgent;
use HTML::TreeBuilder;
use Jcode;
# Build the search URL and fetch the search-result page.
# Specify the URL
# my $url = 'http://www.yahoo.co.jp';
my $search_word = '静岡';
# Convert to UTF-8 (disabled)
#$search_word = jcode($search_word,'euc')->utf8;
# URL-encode: every character matched by \W becomes '%' followed by the
# two hex digits of its first byte.
$search_word =~ s/(\W)/'%' . unpack('H2', $1)/eg;
my $url = 'http://www.google.co.jp/search?hl=ja&q=' . $search_word . '&lr=';
# Identify ourselves as IE8
my $user_agent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)";
# Fetch the HTML
my $ua = LWP::UserAgent->new('agent' => $user_agent);
my $res = $ua->get($url);
# get() reports failures through the response object instead of dying, so
# check it here rather than parsing an error page as if it were results.
die "GET $url failed: " . $res->status_line . "\n" unless $res->is_success;
my $content = $res->content;
# Parse the search-result page with HTML::TreeBuilder.
my $tree = HTML::TreeBuilder->new;
$tree->parse($content);
# parse() only feeds text to the parser; eof() signals that the document is
# complete so the tree is finalised before it is queried.
$tree->eof;

# Walk every <h3> heading: print its text, then print the target extracted
# from each link inside it and remember the targets that chkUrl() accepts.
my @urls;
for my $heading ( $tree->find('h3') ) {
    print $heading->as_text . "\n";
    for my $anchor ( $heading->find('a') ) {
        # print $anchor->attr('href')."\n";
        my $href = $anchor->attr('href');
        # An <a> without an href attribute gives undef; treat it as empty so
        # the pattern match in urlget() does not warn.
        $href = '' unless defined $href;
        my $target = urlget($href);
        print $target . "\n";
        push @urls, $target if chkUrl($target);
    }
}

# Details of the result list: print the text of every element whose class
# attribute is 'st'.
print $_->as_text . "\n" for $tree->look_down( 'class', 'st' );

# The search-result tree is not needed any more; release it.
$tree->delete;

# Open the collected URLs one by one and print each page's text.
for my $page_url (@urls) {
    print $page_url . " ...get\n";
    my $page_res = $ua->get($page_url);
    # A failed fetch has no page text worth printing; report it and move on.
    unless ( $page_res->is_success ) {
        warn "GET $page_url failed: " . $page_res->status_line . "\n";
        next;
    }
    my $page_html = $page_res->content;

    # Convert to UTF-8 when Jcode detects a different encoding. Nothing is
    # converted when detection yields no result or reports plain ASCII.
    my $code = Jcode::getcode($page_html);
    if ( defined $code && $code ne 'utf8' && $code ne 'ascii' ) {
        $page_html = Jcode::convert( $page_res->content, "utf8", $code );
    }

    # Replace every '&' with the bytes EF BC 86 (UTF-8 fullwidth ampersand).
    # NOTE(review): the original had a second substitution, s/&/&/g, which is
    # a no-op after this one and was dropped. The patterns may have lost an
    # HTML entity (e.g. "&amp;") in transcription -- confirm intent.
    $page_html =~ s/&/\xef\xbc\x86/g;

    # Use a fresh tree per page. Feeding every page into one shared tree
    # would pile all documents together and print accumulated text.
    my $page_tree = HTML::TreeBuilder->new;
    $page_tree->parse($page_html);
    $page_tree->eof;
    print $page_tree->as_text . "\n";
    $page_tree->delete;
}
# Extract the destination URL from a search-result link.
#
#   $url - the link's href, expected to contain "?url=<target>&rct".
#
# Returns the text between "?url=" and the first "&rct" that follows it, or
# the empty string when the argument is undefined or has no such part.
sub urlget {
    my $url = shift;
    return "" unless defined $url;

    # Non-greedy capture: stop at the first "&rct". A greedy ".*" would run
    # on to the last "&rct" and drag trailing query parameters into the URL.
    if ( $url =~ /\?url\=(.*?)\&rct/ ) {
        return $1;
    }

    #return $url;
    return "";
}
# Filter a URL before it is queued for fetching.
#
#   $url - candidate URL string.
#
# Returns the empty string (false) when the URL contains "wikipedia.org",
# otherwise returns the URL unchanged.
sub chkUrl {
    my ($candidate) = @_;
    return $candidate =~ /wikipedia\.org/ ? "" : $candidate;
}
ディスカッション
コメント一覧
まだ、コメントがありません