Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Web Scraping with JavaScript

Web Scraping with JavaScript

Given 6/19/2013 at BurlingtonJS at the Dealer.com KB Theater, Burlington, Vermont. The Game of Thrones images were presented as super sweet animated .gifs, but alas, .pdf supports no such sorcery.

A4e9d2559893acd010a92524048402ff?s=128

garrett allen

June 19, 2013
Tweet

Transcript

  1. Web Scraping Presented by Garrett Allen BurlingtonJS

  2. None
  3. None
  4. None
  5. http://rosettacode.org/wiki/Web_scraping http://tycho.usno.navy.mil/cgi-bin/timer.pl

  6. None
  7. None
  8. program WebScrape; {$APPTYPE CONSOLE} {.$DEFINE DEBUG} uses Classes, Winsock; {

    Function to connect to host, send HTTP request and retrieve response } function DoHTTPGET(const hostName: PAnsiChar; const resource: PAnsiChar; HTTPResponse: TStrings): Boolean; const Port: integer = 80; CRLF = #13#10; // carriage return/line feed var WSAData: TWSAData; Sock: TSocket; SockAddrIn: TSockAddrIn; IPAddress: PHostEnt; bytesIn: integer; inBuffer: array [0..1023] of char; Req: string; begin Result := False; HTTPResponse.Clear; { Initialise use of the Windows Sockets DLL. Older Windows versions support Winsock 1.1 whilst newer Windows include Winsock 2 but support 1.1. Therefore, we'll specify version 1.1 ($101) as being the highest version of Windows Sockets that we can use to provide greatest flexibility. WSAData receives details of the Windows Sockets implementation } Winsock.WSAStartUp($101, WSAData); try { Create a socket for TCP/IP usage passing in Address family spec: AF_INET (TCP, UDP, etc.) Type specification: SOCK_STREAM Protocol: IPPROTO_TCP (TCP) } Sock := WinSock.Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); try // Check we have a valid socket if (Sock <> INVALID_SOCKET) then begin // Populate socket address structure with SockAddrIn do begin // Address family specification sin_family := AF_INET; // Port sin_port := htons(Port); // Address sin_addr.s_addr := inet_addr(hostName); end; if (SockAddrIn.sin_addr.s_addr = INADDR_NONE) then begin { As we're using a domain name instead of an IP Address, we need to resolve the domain name } IPAddress := Winsock.gethostbyname(hostName); // Quit if we didn't get an IP Address if (IPAddress = nil) then Exit; // Update the structure with the IP Address SockAddrIn.sin_addr.s_addr := PLongint(IPAddress^.h_addr_list^)^; end; // Try to connect to host if (Winsock.connect(Sock, SockAddrIn, SizeOf(SockAddrIn)) <> SOCKET_ERROR) then begin // OK - Connected // Compose our request // Each line of the request must be terminated with a carriage return/line feed { The First line specifies method (e.g. 
GET, POST), path to required resource, and the HTTP version being used. These three fields are space separated. } Req := 'GET '+resource+' HTTP/1.1' + CRLF + // Host: is the only Required header in HTTP 1.1 'Host: '+hostName + CRLF + { Persistent connections are the default in HTTP 1.1 but, as we don't want or need one for this exercise, we must include the "Connection: close" header in our request } 'Connection: close' + CRLF + CRLF; // Request must end with an empty line! // Try to send the request to the host if (Winsock.send(Sock,Req[1],Length(Req),0) <> SOCKET_ERROR) then begin // Initialise incoming data buffer (i.e. fill array with nulls) FillChar(inBuffer,SizeOf(inBuffer),#0); // Loop until nothing left to read repeat // Read incoming data from socket bytesIn := Winsock.recv(Sock, inBuffer, SizeOf(inBuffer), 0); // Assign buffer to Stringlist HTTPResponse.Text := HTTPResponse.Text + Copy(string(inBuffer),1,bytesIn); until (bytesIn <= 0) or (bytesIn = SOCKET_ERROR); { Our list of response strings should contain at least 1 line } Result := HTTPResponse.Count > 0; end; end; end; finally // Close our socket Winsock.closesocket(Sock); end; finally { This causes our application to deregister itself from this Windows Sockets implementation and allows the implementation to free any resources allocated on our behalf. } Winsock.WSACleanup; end; end; { Simple function to locate and return the UTC time from the request sent to http://tycho.usno.navy.mil/cgi-bin/timer.pl The HTTPResponse parameter contains both the HTTP Headers and the HTML served up by the requested resource. } function ParseResponse(HTTPResponse: TStrings): string; var i: Integer; begin Result := ''; { Check first line for server response code We want something like this: HTTP/1.1 200 OK } if Pos('200',HTTPResponse[0]) > 0 then begin for i := 0 to Pred(HTTPResponse.Count) do begin { The line we're looking for is something like this: <BR>May. 04. 
21:55:19 UTC Universal Time } // Check each line if Pos('UTC',HTTPResponse[i]) > 0 then begin Result := Copy(HTTPResponse[i],5,Pos('UTC',HTTPResponse[i])-1); Break; end; end; end else Result := 'HTTP Error: '+HTTPResponse[0]; end; const host: PAnsiChar = 'tycho.usno.navy.mil'; res : PAnsiChar = '/cgi-bin/timer.pl'; var Response: TStrings; begin { A TStringList is a TStrings descendant class that is used to store and manipulate a list of strings. Instantiate a stringlist class to hold the results of our HTTP GET } Response := TStringList.Create; try // Try an HTTP GET request if DoHTTPGET(host,res,Response) then begin {$IFDEF DEBUG} { Write the entire response to the console window } Writeln(Response.text); {$ELSE} { Parse the response and write the result to the console window } Writeln(ParseResponse(Response)); {$ENDIF DEBUG} end else Writeln('Error retrieving data'); finally Response.Free; end; // Keep console window open Readln; end. http://rosettacode.org/wiki/Web_scraping
  9. Node.io A distributed data scraping and processing engine for Node.js

  10. None
  11. None
  12. $ node.io query "reddit.com/r/javascript" a.title

  13. None
  14. None
  15. None
  16. None
  17. None
  18. None
  19. None
  20. None
  21. CasperJS CasperJS is an open source navigation scripting & testing

    utility written in JavaScript and based on PhantomJS — the scriptable headless WebKit engine.
  22. http://blogs.adobe.com/webplatform/2013/01/21/getting-started-with-the-webkit-layout-code/

  23. http://blogs.adobe.com/webplatform/2013/01/21/getting-started-with-the-webkit-layout-code/

  24. None
  25. None
  26. Is CasperJS a Node.js library?

  27. None
  28. nope

  29. Node.io A distributed data scraping and processing engine for Node.js

  30. None
  31. None
  32. None
  33. None
  34. None
  35. None
  36. None
  37. None
  38. None
  39. None
  40. None
  41. None
  42. None
  43. None
  44. Natural Language Processing = Scraping – pain + other pain

  45. Scraper is coming…

  46. None