Slide 1

Slide 1 text

All YOUR PAGE ARE BELONG TO US すべてのウェブページをこの手に 2012/11/16 株式会社はてな 大西康裕 id:onishi

Slide 2

Slide 2 text

　id:onishi 大西康裕 　ONISHI 　@yasuhiro_onishi 　株式会社はてな 　はてなブログ

Slide 3

Slide 3 text

ウェブページを 保存したい

Slide 4

Slide 4 text

•ウェブページは日々変化する •手元に置いておきたい •競合調査 •魚拓 •画像などまとめて保存したい ウェブページを保存したい

Slide 5

Slide 5 text

Google Chrome

Slide 6

Slide 6 text

No content

Slide 7

Slide 7 text

HTML::Parser my $result; my $parser = HTML::Parser->new( start_h => [ sub {}, 'self,tagname,attr,text' ], default_h => [ sub {}, 'self,text' ], ); $parser->parse($content); print $result; • text • start • end • process • declaration • comment • default

Slide 8

Slide 8 text

HTML::Parser start_h => [ sub { my($self, $tagname, $attr, $text) = @_; $result .= "<$tagname"; for my $key (sort keys %$attr) { my $value = $attr->{$key}; if ($key =~ /^(?:src)$/i) { # HTTP GET して保存してローカルパスにする $value = get_src($value); } $result .= qq{ $key="$value"}; } $result .= ">"; }, 'self,tagname,attr,text', ],

Slide 9

Slide 9 text

HTML::Parser default_h => [ sub { my($self, $text) = @_; $result .= $text; }, 'self,text', ],

Slide 10

Slide 10 text

完

Slide 11

Slide 11 text

No content

Slide 12

Slide 12 text

CSSから参照 $content =~ s{url\(([^\)]+)\)}{ my $link = $1; # relative link (from HTML::ResolveLink) my $u = URI->new($link); unless (defined $u->scheme) { my $old = $u; $u = $u->abs($url); } $link = get_src($u); # HTTP GET して保存してローカルパスに "url($link)"; }eg;

Slide 13

Slide 13 text

script殺す my $context = { disallow => 0 }; my $disallow_tag = qr{script}; start_h => [sub { if ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}++; return; } }], end_h => [sub { if ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}--; return; } }], default_h => [sub { if ($context->{disallow} > 0) { return; } }],

Slide 14

Slide 14 text

noscript内を生かす my $nodisplay_tag = qr{noscript}; start_h => [sub { if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } }], end_h => [sub { if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } }],

Slide 15

Slide 15 text

base start_h => [sub { if ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) { $value = "./"; } }],

Slide 16

Slide 16 text

できました！ gist.github.com

Slide 17

Slide 17 text

#!/usr/bin/env perl
# Save a web page and the assets it references into a dated local directory.
#
# Usage: pass the page URL as the only command-line argument.
#
# The page is fetched, <script> elements are dropped, <noscript> wrappers are
# removed so their content becomes part of the page, every `src` attribute and
# every <link href> target is downloaded and rewritten to a local path, and
# <base href> is pointed at the local directory.  The rewritten page is written
# to "<url-slug>/<yyyy-mm-dd>/index.html" and that path is printed.
use strict;
use warnings;
use utf8;

use DateTime;
use Digest::SHA1 qw(sha1_hex);
use Encode;
use File::Path qw/make_path/;
use HTML::Parser;
use HTML::ResolveLink;
use HTTP::Request::Common qw/GET/;
use IO::All;
use LWP::UserAgent;
use URI;

my $path = './';
my $uri  = URI->new(shift) or die "usage: $0 URL\n";
my $now  = DateTime->now;
my $ymd  = $now->ymd;
my $ua   = LWP::UserAgent->new(agent => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)');

# Make every link in the page absolute before it is rewritten below.
my $resolver = HTML::ResolveLink->new(base => $uri);
my $res      = $ua->request(GET $uri);
$res->is_success
    or die "GET $uri failed: " . $res->status_line . "\n";
my $content = $resolver->resolve($res->decoded_content);

# Output directory: "<path>/<url with non-alphanumerics turned into '-'>/<date>/".
my $dir = "$uri";
$dir =~ s{[^A-Za-z0-9.]+}{-}g;
$dir =~ s{-+$}{};
$dir = "$path/$dir/$ymd/";
$dir =~ s{/+}{/}g;
make_path($dir);

my $disallow_tag  = qr{script};      # elements removed together with their content
my $nodisplay_tag = qr{noscript};    # elements whose tags are removed but content kept

my $result  = '';
my $context = { disallow => 0 };     # > 0 while inside a disallowed element

my $parser = HTML::Parser->new(
    api_version => 3,
    start_h     => [
        sub {
            my ($self, $tagname, $attr, $text) = @_;
            if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
                return;
            }
            elsif ($tagname =~ /^(?:$disallow_tag)$/i) {
                $context->{disallow}++;
                return;
            }

            # Rebuild the start tag from its attributes so that asset URLs
            # can be swapped for local paths.
            $result .= "<$tagname";
            for my $key (sort keys %$attr) {
                # Skip the "/" pseudo-attribute (presumably what the parser
                # reports for self-closing tags such as <br />).
                $key eq '/' and next;
                my $value = $attr->{$key};
                if ($key =~ /^(?:src)$/i) {
                    $value = get_src($value);
                }
                elsif ($tagname =~ /^(?:link)$/i and $key =~ /^(?:href)$/i) {
                    $value = get_link($value);
                }
                elsif ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) {
                    $value = $path;
                }
                # HTML::Parser hands over attribute values with entities
                # already decoded, so re-encode before writing them back out.
                $result .= qq{ $key="} . escape_attr($value) . qq{"};
            }
            $result .= ">";
        },
        'self,tagname,attr,text',
    ],
    end_h => [
        sub {
            my ($self, $tagname, $text) = @_;
            if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
                return;
            }
            elsif ($tagname =~ /^(?:$disallow_tag)$/i) {
                # Never go below zero: a stray closing tag must not cancel
                # out a later opening tag.
                $context->{disallow}-- if $context->{disallow} > 0;
                return;
            }
            $result .= $text;
        },
        'self,tagname,text',
    ],
    default_h => [
        sub {
            my ($self, $text) = @_;
            if ($context->{disallow} > 0) {
                return;
            }
            $result .= $text;
        },
        'self,text',
    ],
);
$parser->parse($content);
$parser->eof;    # flush any text the parser is still buffering

# XXX
# NOTE(review): the slide export stripped the markup from this substitution
# (it read `s{(]*>)}{$1}i`, a no-op).  Reconstructed as declaring the UTF-8
# charset right after <head>, matching the encode() below -- confirm against
# the published script.
$result =~ s{(<head[^>]*>)}{$1<meta charset="utf-8">}i;
$result = Encode::encode('utf-8', $result);
$result > io("${dir}index.html");
print "${dir}index.html\n";

# Escape the characters that would break a double-quoted HTML attribute value.
# An undefined value is written as an empty string.
sub escape_attr {
    my $value = shift;
    return '' unless defined $value;
    $value =~ s/&/&amp;/g;
    $value =~ s/"/&quot;/g;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    return $value;
}

# Download $src into "${dir}file/" unless a file of that name already exists.
# Returns the path relative to $dir ("file/<name>"), or nothing when $src is
# empty.
sub get_src {
    my $src = shift or return;
    unless (-e "${dir}file") {
        make_path("${dir}file");
    }

    # Local file name: the URL with every run of other characters turned into
    # "-"; names longer than 255 characters are replaced by their SHA-1 hex
    # digest.
    my $file = "$src";
    $file =~ s{[^A-Za-z0-9.]+}{-}g;
    if (length($file) > 255) {
        $file = sha1_hex($file);
    }
    $file = "file/$file";
    $file =~ s{/+}{/}g;

    unless (-e "$dir$file") {
        my $asset = $ua->request(GET $src);
        $asset->is_success
            or warn "GET $src failed: " . $asset->status_line . "\n";
        # The body is stored even for a failed request so that callers can
        # rely on the file existing.
        $asset->content >> io("$dir$file");
        sleep(1);    # throttle requests (original note: DoS countermeasure)
    }
    return $file;
}

# Download the file behind a <link href> and localise every url(...) inside it.
# Returns the path relative to $dir, or nothing when $url is empty.
sub get_link {
    my $url  = shift or return;
    my $file = get_src($url);
    my $io   = io("$dir$file");
    my $content = $io->slurp;
    $content =~ s{url\(([^\)]+)\)}{
        my $link = $1;
        # Strip surrounding whitespace and quotes.
        $link =~ s{^[\s\"\']+}{};
        $link =~ s{[\s\"\']+$}{};
        # relative link (from HTML::ResolveLink)
        my $u = URI->new($link);
        unless (defined $u->scheme) {
            $u = $u->abs($url);
        }
        $link = get_src($u);
        # The linked file itself lives in file/, so drop that prefix.
        $link =~ s{^file/}{};
        "url($link)";
    }eg;
    $content > $io;
    return $file;
}

Slide 18

Slide 18 text

Google Chrome

Slide 19

Slide 19 text

No content

Slide 20

Slide 20 text

wget.pl

Slide 21

Slide 21 text

No content

Slide 22

Slide 22 text

どうぞご利用ください！ gist.github.com

Slide 23

Slide 23 text

ご清聴ありがとうございました