Slide 1

Slide 1 text

Web Scraping Presented by Garrett Allen BurlingtonJS

Slide 2

Slide 2 text

No content

Slide 3

Slide 3 text

No content

Slide 4

Slide 4 text

No content

Slide 5

Slide 5 text

http://rosettacode.org/wiki/Web_scraping http://tycho.usno.navy.mil/cgi-bin/timer.pl

Slide 6

Slide 6 text

No content

Slide 7

Slide 7 text

No content

Slide 8

Slide 8 text

program WebScrape;

{$APPTYPE CONSOLE}

{.$DEFINE DEBUG}

uses
  Classes, Winsock;

{ Connects to hostName on TCP port 80, sends an HTTP/1.1 GET request for
  resource and stores the raw response (status line, headers and body) in
  HTTPResponse, one list entry per line.

  Parameters:
    hostName     - dotted IP address or domain name of the server
    resource     - path of the resource to request, e.g. '/index.html'
    HTTPResponse - cleared on entry; receives the response lines

  Returns True when at least one response line was received, False on any
  Winsock failure (startup, socket creation, name resolution, connect, send)
  or when nothing was received. }
function DoHTTPGET(const hostName: PAnsiChar; const resource: PAnsiChar;
  HTTPResponse: TStrings): Boolean;
const
  Port: integer = 80;
  CRLF = #13#10; // carriage return/line feed
var
  WSAData: TWSAData;
  Sock: TSocket;
  SockAddrIn: TSockAddrIn;
  IPAddress: PHostEnt;
  bytesIn: integer;
  // Byte-sized buffer and strings: the socket API works in bytes, so these
  // must not be the (possibly 2-byte) Char/string types.
  inBuffer: array [0..1023] of AnsiChar;
  Req: AnsiString;
  Chunk: AnsiString;
  Received: AnsiString;
begin
  Result := False;
  HTTPResponse.Clear;

  { Initialise use of the Windows Sockets DLL.
    Older Windows versions support Winsock 1.1 whilst newer Windows
    include Winsock 2 but support 1.1. Therefore, we'll specify
    version 1.1 ($101) as being the highest version of Windows Sockets
    that we can use to provide greatest flexibility.
    WSAData receives details of the Windows Sockets implementation.
    A non-zero result means startup failed, in which case WSACleanup must
    not be called, so leave before entering the try block. }
  if Winsock.WSAStartup($101, WSAData) <> 0 then
    Exit;
  try
    { Create a socket for TCP/IP usage passing in
      Address family spec: AF_INET (TCP, UDP, etc.)
      Type specification: SOCK_STREAM
      Protocol: IPPROTO_TCP (TCP) }
    Sock := Winsock.socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

    // Without a valid socket there is nothing to connect or to close
    if Sock = INVALID_SOCKET then
      Exit;
    try
      // Populate socket address structure (zero it first so the padding
      // bytes are not left uninitialised)
      FillChar(SockAddrIn, SizeOf(SockAddrIn), 0);
      SockAddrIn.sin_family := AF_INET;                    // Address family
      SockAddrIn.sin_port := htons(Port);                  // Port
      SockAddrIn.sin_addr.s_addr := inet_addr(hostName);   // Address

      if SockAddrIn.sin_addr.s_addr = INADDR_NONE then
      begin
        { hostName was not a dotted IP address, so treat it as a domain
          name and resolve it }
        IPAddress := Winsock.gethostbyname(hostName);
        // Quit if we didn't get an IP Address
        if IPAddress = nil then
          Exit;
        // Update the structure with the first resolved IP Address
        SockAddrIn.sin_addr.s_addr := PLongint(IPAddress^.h_addr_list^)^;
      end;

      // Try to connect to host
      if Winsock.connect(Sock, SockAddrIn, SizeOf(SockAddrIn)) = SOCKET_ERROR then
        Exit;

      { Compose our request.
        Each line of the request must be terminated with a carriage
        return/line feed.
        The first line specifies method (e.g. GET, POST), path to required
        resource, and the HTTP version being used. These three fields are
        space separated.
        Host: is the only required header in HTTP 1.1.
        Persistent connections are the default in HTTP 1.1 but, as we don't
        want or need one for this exercise, we must include the
        "Connection: close" header in our request.
        The request must end with an empty line! }
      Req := 'GET ' + AnsiString(resource) + ' HTTP/1.1' + CRLF +
             'Host: ' + AnsiString(hostName) + CRLF +
             'Connection: close' + CRLF +
             CRLF;

      // Try to send the request to the host
      if Winsock.send(Sock, Req[1], Length(Req), 0) = SOCKET_ERROR then
        Exit;

      { Read until the peer closes the connection (recv returns 0) or an
        error occurs (recv returns SOCKET_ERROR, a negative value).
        The bytes are gathered in one string and only split into lines once
        everything has arrived: appending to HTTPResponse.Text chunk by
        chunk would break any line that straddles two reads, because the
        Text getter terminates every line with a line break. }
      Received := '';
      repeat
        bytesIn := Winsock.recv(Sock, inBuffer, SizeOf(inBuffer), 0);
        if bytesIn > 0 then
        begin
          // Copy exactly the bytes received, embedded nulls included
          SetString(Chunk, PAnsiChar(@inBuffer[0]), bytesIn);
          Received := Received + Chunk;
        end;
      until bytesIn <= 0;

      // Split the complete response into lines
      HTTPResponse.Text := string(Received);

      { Our list of response strings should contain at least 1 line }
      Result := HTTPResponse.Count > 0;
    finally
      // Close our socket
      Winsock.closesocket(Sock);
    end;
  finally
    { This causes our application to deregister itself from this
      Windows Sockets implementation and allows the implementation
      to free any resources allocated on our behalf. }
    Winsock.WSACleanup;
  end;
end;

{ Simple function to locate and return the UTC time from the
  request sent to http://tycho.usno.navy.mil/cgi-bin/timer.pl

  The HTTPResponse parameter contains both the HTTP Headers and
  the HTML served up by the requested resource.

  Returns the extracted time text, an empty string when no line contains
  'UTC', or a string starting with 'HTTP Error: ' when the first line does
  not contain '200' or the response is empty. }
function ParseResponse(HTTPResponse: TStrings): string;
var
  i: Integer;
  UTCPos: Integer;
begin
  Result := '';

  // Guard: HTTPResponse[0] below would raise on an empty list
  if HTTPResponse.Count = 0 then
  begin
    Result := 'HTTP Error: empty response';
    Exit;
  end;

  { Check first line for server response code
    We want something like this: HTTP/1.1 200 OK }
  if Pos('200', HTTPResponse[0]) = 0 then
  begin
    Result := 'HTTP Error: ' + HTTPResponse[0];
    Exit;
  end;

  { The line we're looking for is something like this:
      May. 04. 21:55:19 UTC Universal Time
    NOTE(review): the Copy below starts at character 5, i.e. it skips a
    four-character prefix on the matching line - presumably an HTML tag
    such as <BR>; confirm against the live page. }
  for i := 0 to Pred(HTTPResponse.Count) do
  begin
    // Check each line
    UTCPos := Pos('UTC', HTTPResponse[i]);
    if UTCPos > 0 then
    begin
      // From character 5 up to and including 'UTC' plus the one
      // character that follows it
      Result := Copy(HTTPResponse[i], 5, UTCPos - 1);
      Break;
    end;
  end;
end;

const
  host: PAnsiChar = 'tycho.usno.navy.mil';
  res : PAnsiChar = '/cgi-bin/timer.pl';

var
  Response: TStrings;

begin
  { A TStringList is a TStrings descendant class that is used to store
    and manipulate a list of strings.
    Instantiate a stringlist class to hold the results of our HTTP GET }
  Response := TStringList.Create;
  try
    // Try an HTTP GET request
    if DoHTTPGET(host, res, Response) then
    begin
      {$IFDEF DEBUG}
      { Write the entire response to the console window }
      Writeln(Response.text);
      {$ELSE}
      { Parse the response and write the result to the console window }
      Writeln(ParseResponse(Response));
      {$ENDIF DEBUG}
    end
    else
      Writeln('Error retrieving data');
  finally
    Response.Free;
  end;

  // Keep console window open
  Readln;
end.
http://rosettacode.org/wiki/Web_scraping

Slide 9

Slide 9 text

Node.io A distributed data scraping and processing engine for Node.js

Slide 10

Slide 10 text

No content

Slide 11

Slide 11 text

No content

Slide 12

Slide 12 text

$ node.io query "reddit.com/r/javascript" a.title

Slide 13

Slide 13 text

No content

Slide 14

Slide 14 text

No content

Slide 15

Slide 15 text

No content

Slide 16

Slide 16 text

No content

Slide 17

Slide 17 text

No content

Slide 18

Slide 18 text

No content

Slide 19

Slide 19 text

No content

Slide 20

Slide 20 text

No content

Slide 21

Slide 21 text

CasperJS CasperJS is an open source navigation scripting & testing utility written in JavaScript and based on PhantomJS — the scriptable headless WebKit engine.

Slide 22

Slide 22 text

http://blogs.adobe.com/webplatform/2013/01/21/getting-started-with-the-webkit-layout-code/

Slide 23

Slide 23 text

http://blogs.adobe.com/webplatform/2013/01/21/getting-started-with-the-webkit-layout-code/

Slide 24

Slide 24 text

No content

Slide 25

Slide 25 text

No content

Slide 26

Slide 26 text

Is CasperJS a Node.js library?

Slide 27

Slide 27 text

No content

Slide 28

Slide 28 text

nope

Slide 29

Slide 29 text

Node.io A distributed data scraping and processing engine for Node.js

Slide 30

Slide 30 text

No content

Slide 31

Slide 31 text

No content

Slide 32

Slide 32 text

No content

Slide 33

Slide 33 text

No content

Slide 34

Slide 34 text

No content

Slide 35

Slide 35 text

No content

Slide 36

Slide 36 text

No content

Slide 37

Slide 37 text

No content

Slide 38

Slide 38 text

No content

Slide 39

Slide 39 text

No content

Slide 40

Slide 40 text

No content

Slide 41

Slide 41 text

No content

Slide 42

Slide 42 text

No content

Slide 43

Slide 43 text

No content

Slide 44

Slide 44 text

Natural Language Processing = Scraping – pain + other pain

Slide 45

Slide 45 text

Scraper is coming…

Slide 46

Slide 46 text

No content