Upgrade to Pro — share decks privately, control downloads, hide ads and more …

wget.pl

Sponsored · Your Podcast. Everywhere. Effortlessly. Share. Educate. Inspire. Entertain. You do you. We'll handle the rest.
Avatar for Yasuhiro Onishi Yasuhiro Onishi
November 16, 2012
1.5k

 wget.pl

Avatar for Yasuhiro Onishi

Yasuhiro Onishi

November 16, 2012
Tweet

Transcript

  1. )5.-1BSTFS my $result; my $parser = HTML::Parser->new( start_h => [

    sub {}, 'self,tagname,attr,text' ], default_h => [ sub {}, 'self,text' ], ); $parser->parse($content); print $result; w text w start w end w process w declaration w comment w default
  2. )5.-1BSTFS start_h => [ sub { my($self, $tagname, $attr, $text)

    = @_; $result .= "<$tagname"; for my $key (sort keys %$attr) { my $value = $attr->{$key}; if ($key =~ /^(?:src)$/i) { # HTTP GET ͯ͠อଘͯ͠ϩʔΧϧύεʹ͢Δ $value = get_src($value); } $result .= qq{ $key="$value"}; } $result .= ">"; }, 'self,tagname,attr,text', ],
  3. )5.-1BSTFS default_h => [ sub { my($self, $text) = @_;

    $result .= $text; }, 'self,text', ],
  4. ׬

  5. $44͔Βࢀর $content =~ s{url\(([^\)]+)\)}{ my $link = $1; # relative

    link (from HTML::ResolveLink) my $u = URI->new($link); unless (defined $u->scheme) { my $old = $u; $u = $u->abs($url); } $link = get_src($u); # HTTP GET ͯ͠อଘͯ͠ϩʔΧϧύεʹ "url($link)"; }eg;
  6. TDSJQUࡴ͢ my $context = { disallow => 0 }; my

    $disallow_tag = qr{script}; start_h => [sub { if ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}++; return; } }], end_h => [sub { if ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}--; return; } }], default_h => [sub { if ($context->{disallow} > 0) { return; } }],
  7. OPTDSJQU಺Λੜ͔͢ my $nodisplay_tag = qr{noscript}; start_h => [sub { if

    ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } }], end_h => [sub { if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } }],
  8. CBTF start_h => [sub { if ($tagname =~ /^(?:base)$/i and

    $key =~ /^(?:href)$/i) { $value = "./"; } }],
  9. #!/usr/bin/env perl use strict; use warnings; use utf8; use DateTime;

    use Digest::SHA1 qw(sha1_hex); use Encode; use File::Path qw/make_path/; use HTML::Parser; use HTML::ResolveLink; use HTTP::Request::Common qw/GET/; use IO::All; use LWP::UserAgent; use URI; my $path = './'; my $uri = URI->new(shift) or die; my $now = DateTime->now; my $ymd = $now->ymd; my $ua = LWP::UserAgent->new(agent => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'); my $resolver = HTML::ResolveLink->new(base => $uri); my $res = $ua->request(GET $uri); my $content = $resolver->resolve($res->decoded_content); my $dir = $uri; $dir =~ s{[^A-Za-z0-9.]+}{-}g; $dir =~ s{-+$}{}; $dir = "$path/$dir/$ymd/"; $dir =~ s{/+}{/}g; make_path($dir); my $disallow_tag = qr{script}; my $nodisplay_tag = qr{noscript}; my $result; my $context = { disallow => 0 }; my $parser = HTML::Parser->new( api_version => 3, start_h => [ sub { my($self, $tagname, $attr, $text) = @_; if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } elsif ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}++; return; } $result .= "<$tagname"; for my $key (sort keys %$attr) { $key eq '/' and next; my $value = $attr->{$key}; if ($key =~ /^(?:src)$/i) { $value = get_src($value); } elsif ($tagname =~ /^(?:link)$/i and $key =~ /^(?:href)$/i) { $value = get_link($value); } elsif ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) { $value = $path; } $result .= qq{ $key="$value"}; } $result .= ">"; }, 'self,tagname,attr,text', ], end_h => [ sub { my($self, $tagname, $text) = @_; if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } elsif ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}--; return; } $result .= $text; }, 'self,tagname,text', ], default_h => [ sub { my($self, $text) = @_; if ($context->{disallow} > 0) { return; } $result .= $text; }, 'self,text', ], ); $parser->parse($content); $result =~ s{(<head[^>]*>)}{$1<meta http-equiv="Content-Type" content="text/html; charset=utf-8">}i; # XXX $result = Encode::encode('utf-8', $result); $result > io("${dir}index.html"); print "${dir}index.html\n"; sub get_src { my $src = shift or return; unless (-e "${dir}file") { make_path("${dir}file"); } my $file = $src; $file =~ s{[^A-Za-z0-9.]+}{-}g; if (length($file) > 255) { $file = sha1_hex($file); } $file = "file/$file"; $file =~ s{/+}{/}g; unless (-e "$dir$file") { $ua->request(GET $src)->content >> io("$dir$file"); sleep(1); # DOSରࡦରࡦ } $file; } sub get_link { my $url = shift or return; my $file = get_src($url); my $io = io("$dir$file"); my $content = $io->slurp; $content =~ s{url\(([^\)]+)\)}{ my $link = $1; $link =~ s{^[\s\"\']+}{}; $link =~ s{[\s\"\']+$}{}; # relative link (from HTML::ResolveLink) my $u = URI->new($link); unless (defined $u->scheme) { my $old = $u; $u = $u->abs($url); } $link = get_src($u); $link =~ s{^file/}{}; "url($link)"; }eg; $content > $io; return $file; }