Slide 8
Slide 8 text
program WebScrape;
{$APPTYPE CONSOLE}
{.$DEFINE DEBUG}
uses
Classes,
Winsock;
{ Function to connect to host, send HTTP request and
retrieve response }
function DoHTTPGET(const hostName: PAnsiChar; const
resource: PAnsiChar; HTTPResponse: TStrings): Boolean;
const
Port: integer = 80;
CRLF = #13#10; // carriage return/line feed
var
WSAData: TWSAData;
Sock: TSocket;
SockAddrIn: TSockAddrIn;
IPAddress: PHostEnt;
bytesIn: integer;
inBuffer: array [0..1023] of char;
Req: string;
begin
Result := False;
HTTPResponse.Clear;
{ Initialise use of the Windows Sockets DLL.
Older Windows versions support Winsock 1.1 whilst newer
Windows
include Winsock 2 but support 1.1. Therefore, we'll specify
version 1.1 ($101) as being the highest version of Windows
Sockets
that we can use to provide greatest flexibility.
WSAData receives details of the Windows Sockets
implementation }
Winsock.WSAStartUp($101, WSAData);
try
{ Create a socket for TCP/IP usage passing in
Address family spec: AF_INET (TCP, UDP, etc.)
Type specification: SOCK_STREAM
Protocol: IPPROTO_TCP (TCP) }
Sock := WinSock.Socket(AF_INET, SOCK_STREAM,
IPPROTO_TCP);
try
// Check we have a valid socket
if (Sock <> INVALID_SOCKET) then
begin
// Populate socket address structure
with SockAddrIn do
begin
// Address family specification
sin_family := AF_INET;
// Port
sin_port := htons(Port);
// Address
sin_addr.s_addr := inet_addr(hostName);
end;
if (SockAddrIn.sin_addr.s_addr = INADDR_NONE) then
begin
{ As we're using a domain name instead of an
IP Address, we need to resolve the domain name }
IPAddress := Winsock.gethostbyname(hostName);
// Quit if we didn't get an IP Address
if (IPAddress = nil) then
Exit;
// Update the structure with the IP Address
SockAddrIn.sin_addr.s_addr :=
PLongint(IPAddress^.h_addr_list^)^;
end;
// Try to connect to host
if (Winsock.connect(Sock, SockAddrIn, SizeOf(SockAddrIn)) <>
SOCKET_ERROR) then
begin
// OK - Connected
// Compose our request
// Each line of the request must be terminated with a
carriage return/line feed
{ The First line specifies method (e.g. GET, POST), path to
required resource,
and the HTTP version being used. These three fields are
space separated. }
Req := 'GET '+resource+' HTTP/1.1' + CRLF +
// Host: is the only Required header in HTTP 1.1
'Host: '+hostName + CRLF +
{ Persistent connections are the default in HTTP 1.1 but, as
we don't want
or need one for this exercise, we must include the
"Connection: close"
header in our request }
'Connection: close' + CRLF +
CRLF; // Request must end with an empty line!
// Try to send the request to the host
if (Winsock.send(Sock,Req[1],Length(Req),0) <>
SOCKET_ERROR) then
begin
// Initialise incoming data buffer (i.e. fill array with nulls)
FillChar(inBuffer,SizeOf(inBuffer),#0);
// Loop until nothing left to read
repeat
// Read incoming data from socket
bytesIn := Winsock.recv(Sock, inBuffer, SizeOf(inBuffer), 0);
// Assign buffer to Stringlist
HTTPResponse.Text := HTTPResponse.Text +
Copy(string(inBuffer),1,bytesIn);
until
(bytesIn <= 0) or (bytesIn = SOCKET_ERROR);
{ Our list of response strings should
contain at least 1 line }
Result := HTTPResponse.Count > 0;
end;
end;
end;
finally
// Close our socket
Winsock.closesocket(Sock);
end;
finally
{ This causes our application to deregister itself from this
Windows Sockets implementation and allows the
implementation
to free any resources allocated on our behalf. }
Winsock.WSACleanup;
end;
end;
{ Simple function to locate and return the UTC time from the
request sent to http://tycho.usno.navy.mil/cgi-bin/timer.pl
The HTTPResponse parameter contains both the HTTP
Headers and
the HTML served up by the requested resource. }
function ParseResponse(HTTPResponse: TStrings): string;
var
i: Integer;
begin
Result := '';
{ Check first line for server response code
We want something like this: HTTP/1.1 200 OK }
if Pos('200',HTTPResponse[0]) > 0 then
begin
for i := 0 to Pred(HTTPResponse.Count) do
begin
{ The line we're looking for is something like this:
May. 04. 21:55:19 UTC Universal Time }
// Check each line
if Pos('UTC',HTTPResponse[i]) > 0 then
begin
Result :=
Copy(HTTPResponse[i],5,Pos('UTC',HTTPResponse[i])-1);
Break;
end;
end;
end
else
Result := 'HTTP Error: '+HTTPResponse[0];
end;
const
host: PAnsiChar = 'tycho.usno.navy.mil';
res : PAnsiChar = '/cgi-bin/timer.pl';
var
Response: TStrings;
begin
{ A TStringList is a TStrings descendant class
that is used to store and manipulate a list
of strings.
Instantiate a stringlist class to
hold the results of our HTTP GET }
Response := TStringList.Create;
try
// Try an HTTP GET request
if DoHTTPGET(host,res,Response) then
begin
{$IFDEF DEBUG}
{ Write the entire response to
the console window }
Writeln(Response.text);
{$ELSE}
{ Parse the response and write the
result to the console window }
Writeln(ParseResponse(Response));
{$ENDIF DEBUG}
end
else
Writeln('Error retrieving data');
finally
Response.Free;
end;
// Keep console window open
Readln;
end.
http://rosettacode.org/wiki/Web_scraping