use Digest::SHA1 qw(sha1_hex); use Encode; use File::Path qw/make_path/; use HTML::Parser; use HTML::ResolveLink; use HTTP::Request::Common qw/GET/; use IO::All; use LWP::UserAgent; use URI; my $path = './'; my $uri = URI->new(shift) or die; my $now = DateTime->now; my $ymd = $now->ymd; my $ua = LWP::UserAgent->new(agent => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'); my $resolver = HTML::ResolveLink->new(base => $uri); my $res = $ua->request(GET $uri); my $content = $resolver->resolve($res->decoded_content); my $dir = $uri; $dir =~ s{[^A-Za-z0-9.]+}{-}g; $dir =~ s{-+$}{}; $dir = "$path/$dir/$ymd/"; $dir =~ s{/+}{/}g; make_path($dir); my $disallow_tag = qr{script}; my $nodisplay_tag = qr{noscript}; my $result; my $context = { disallow => 0 }; my $parser = HTML::Parser->new( api_version => 3, start_h => [ sub { my($self, $tagname, $attr, $text) = @_; if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } elsif ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}++; return; } $result .= "<$tagname"; for my $key (sort keys %$attr) { $key eq '/' and next; my $value = $attr->{$key}; if ($key =~ /^(?:src)$/i) { $value = get_src($value); } elsif ($tagname =~ /^(?:link)$/i and $key =~ /^(?:href)$/i) { $value = get_link($value); } elsif ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) { $value = $path; } $result .= qq{ $key="$value"}; } $result .= ">"; }, 'self,tagname,attr,text', ], end_h => [ sub { my($self, $tagname, $text) = @_; if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } elsif ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}--; return; } $result .= $text; }, 'self,tagname,text', ], default_h => [ sub { my($self, $text) = @_; if ($context->{disallow} > 0) { return; } $result .= $text; }, 'self,text', ], ); $parser->parse($content); $result =~ s{(<head[^>]*>)}{$1<meta http-equiv="Content-Type" content="text/html; charset=utf-8">}i; # XXX $result = Encode::encode('utf-8', $result); $result > io("${dir}index.html"); print "${dir}index.html\n"; sub get_src { my $src = shift or return; unless (-e "${dir}file") { make_path("${dir}file"); } my $file = $src; $file =~ s{[^A-Za-z0-9.]+}{-}g; if (length($file) > 255) { $file = sha1_hex($file); } $file = "file/$file"; $file =~ s{/+}{/}g; unless (-e "$dir$file") { $ua->request(GET $src)->content >> io("$dir$file"); sleep(1); # DOSରࡦରࡦ } $file; } sub get_link { my $url = shift or return; my $file = get_src($url); my $io = io("$dir$file"); my $content = $io->slurp; $content =~ s{url\(([^\)]+)\)}{ my $link = $1; $link =~ s{^[\s\"\']+}{}; $link =~ s{[\s\"\']+$}{}; # relative link (from HTML::ResolveLink) my $u = URI->new($link); unless (defined $u->scheme) { my $old = $u; $u = $u->abs($url); } $link = get_src($u); $link =~ s{^file/}{}; "url($link)"; }eg; $content > $io; return $file; }