Upgrade to Pro — share decks privately, control downloads, hide ads and more …

wget.pl

Yasuhiro Onishi
November 16, 2012
1.3k

 wget.pl

Yasuhiro Onishi

November 16, 2012
Tweet

Transcript

  1. All YOUR PAGE ARE BELONG TO US
    ͢΂ͯͷ΢ΣϒϖʔδΛ͜ͷखʹ
    2012/11/16
    גࣜձࣾ͸ͯͳ
    େ੢߁༟JEPOJTIJ

    View Slide

  2. ɹJEPOJTIJେ੢߁༟
    ɹ0/*4)*
    ɹ[email protected]
    ɹגࣜձࣾ͸ͯͳ
    ɹ͸ͯͳϒϩά

    View Slide

  3. ΢ΣϒϖʔδΛ
    อଘ͍ͨ͠

    View Slide

  4. w΢Σϒϖʔδ͸೔ʑมԽ͢Δ
    wखݩʹஔ͍͓͖͍ͯͨ
    wڝ߹ௐࠪ
    wڕ୓
    wը૾ͳͲ·ͱΊͯอଘ͍ͨ͠
    ΢ΣϒϖʔδΛอଘ͍ͨ͠

    View Slide

  5. (PPHMF
    $ISPNF

    View Slide

  6. View Slide

  7. )5.-1BSTFS
    my $result;
    my $parser = HTML::Parser->new(
    start_h => [ sub {}, 'self,tagname,attr,text' ],
    default_h => [ sub {}, 'self,text' ],
    );
    $parser->parse($content);
    print $result;
    w text
    w start
    w end
    w process
    w declaration
    w comment
    w default

    View Slide

  8. )5.-1BSTFS
    start_h => [
    sub {
    my($self, $tagname, $attr, $text) = @_;
    $result .= "<$tagname";
    for my $key (sort keys %$attr) {
    my $value = $attr->{$key};
    if ($key =~ /^(?:src)$/i) {
    # HTTP GET ͯ͠อଘͯ͠ϩʔΧϧύεʹ͢Δ
    $value = get_src($value);
    }
    $result .= qq{ $key="$value"};
    }
    $result .= ">";
    },
    'self,tagname,attr,text',
    ],

    View Slide

  9. )5.-1BSTFS
    default_h => [
    sub {
    my($self, $text) = @_;
    $result .= $text;
    },
    'self,text',
    ],

    View Slide

  10. ׬

    View Slide

  11. View Slide

  12. $44͔Βࢀর
    $content =~ s{url\(([^\)]+)\)}{
    my $link = $1;
    # relative link (from HTML::ResolveLink)
    my $u = URI->new($link);
    unless (defined $u->scheme) {
    my $old = $u;
    $u = $u->abs($url);
    }
    $link = get_src($u); # HTTP GET ͯ͠อଘͯ͠ϩʔΧϧύεʹ
    "url($link)";
    }eg;

    View Slide

  13. TDSJQUࡴ͢
    my $context = { disallow => 0 };
    my $disallow_tag = qr{script};
    start_h => [sub {
    if ($tagname =~ /^(?:$disallow_tag)$/i) {
    $context->{disallow}++; return;
    }
    }],
    end_h => [sub {
    if ($tagname =~ /^(?:$disallow_tag)$/i) {
    $context->{disallow}--; return;
    }
    }],
    default_h => [sub {
    if ($context->{disallow} > 0) {
    return;
    }
    }],

    View Slide

  14. OPTDSJQU಺Λੜ͔͢
    my $nodisplay_tag = qr{noscript};
    start_h => [sub {
    if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
    return;
    }
    }],
    end_h => [sub {
    if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
    return;
    }
    }],

    View Slide

  15. CBTF
    start_h => [sub {
    if ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) {
    $value = "./";
    }
    }],

    View Slide

  16. Ͱ͖·ͨ͠ʂ
    HJTUHJUIVCDPN

    View Slide

  17. #!/usr/bin/env perl
    use strict;
    use warnings;
    use utf8;
    use DateTime;
    use Digest::SHA1 qw(sha1_hex);
    use Encode;
    use File::Path qw/make_path/;
    use HTML::Parser;
    use HTML::ResolveLink;
    use HTTP::Request::Common qw/GET/;
    use IO::All;
    use LWP::UserAgent;
    use URI;
    my $path = './';
    my $uri = URI->new(shift) or die;
    my $now = DateTime->now;
    my $ymd = $now->ymd;
    my $ua = LWP::UserAgent->new(agent => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)');
    my $resolver = HTML::ResolveLink->new(base => $uri);
    my $res = $ua->request(GET $uri);
    my $content = $resolver->resolve($res->decoded_content);
    my $dir = $uri;
    $dir =~ s{[^A-Za-z0-9.]+}{-}g;
    $dir =~ s{-+$}{};
    $dir = "$path/$dir/$ymd/";
    $dir =~ s{/+}{/}g;
    make_path($dir);
    my $disallow_tag = qr{script};
    my $nodisplay_tag = qr{noscript};
    my $result;
    my $context = { disallow => 0 };
    my $parser = HTML::Parser->new(
    api_version => 3,
    start_h => [
    sub {
    my($self, $tagname, $attr, $text) = @_;
    if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
    return;
    } elsif ($tagname =~ /^(?:$disallow_tag)$/i) {
    $context->{disallow}++;
    return;
    }
    $result .= "<$tagname";
    for my $key (sort keys %$attr) {
    $key eq '/' and next;
    my $value = $attr->{$key};
    if ($key =~ /^(?:src)$/i) {
    $value = get_src($value);
    } elsif ($tagname =~ /^(?:link)$/i and $key =~ /^(?:href)$/i) {
    $value = get_link($value);
    } elsif ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) {
    $value = $path;
    }
    $result .= qq{ $key="$value"};
    }
    $result .= ">";
    },
    'self,tagname,attr,text',
    ],
    end_h => [
    sub {
    my($self, $tagname, $text) = @_;
    if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
    return;
    } elsif ($tagname =~ /^(?:$disallow_tag)$/i) {
    $context->{disallow}--;
    return;
    }
    $result .= $text;
    },
    'self,tagname,text',
    ],
    default_h => [
    sub {
    my($self, $text) = @_;
    if ($context->{disallow} > 0) {
    return;
    }
    $result .= $text;
    },
    'self,text',
    ],
    );
    $parser->parse($content);
    $result =~ s{(]*>)}{$1}i; # XXX
    $result = Encode::encode('utf-8', $result);
    $result > io("${dir}index.html");
    print "${dir}index.html\n";
    sub get_src {
    my $src = shift or return;
    unless (-e "${dir}file") {
    make_path("${dir}file");
    }
    my $file = $src;
    $file =~ s{[^A-Za-z0-9.]+}{-}g;
    if (length($file) > 255) {
    $file = sha1_hex($file);
    }
    $file = "file/$file";
    $file =~ s{/+}{/}g;
    unless (-e "$dir$file") {
    $ua->request(GET $src)->content >> io("$dir$file");
    sleep(1); # DOSରࡦରࡦ
    }
    $file;
    }
    sub get_link {
    my $url = shift or return;
    my $file = get_src($url);
    my $io = io("$dir$file");
    my $content = $io->slurp;
    $content =~ s{url\(([^\)]+)\)}{
    my $link = $1;
    $link =~ s{^[\s\"\']+}{};
    $link =~ s{[\s\"\']+$}{};
    # relative link (from HTML::ResolveLink)
    my $u = URI->new($link);
    unless (defined $u->scheme) {
    my $old = $u;
    $u = $u->abs($url);
    }
    $link = get_src($u);
    $link =~ s{^file/}{};
    "url($link)";
    }eg;
    $content > $io;
    return $file;
    }

    View Slide

  18. (PPHMF
    $ISPNF

    View Slide

  19. View Slide

  20. XHFUQM

    View Slide

  21. View Slide

  22. Ͳ͏ͧ͝ར༻͍ͩ͘͞ʂ
    HJTUHJUIVCDPN

    View Slide

  23. ͝ਗ਼ௌ͋Γ͕ͱ͏͍͟͝·ͨ͠

    View Slide