资源简介
本程序实现了网络爬虫中爬取网页链接的功能:向用户输入的主机发送 HTTP GET 请求,并用正则表达式从返回的网页源码中提取以 http:// 开头的链接。
代码片段和文件信息
#include <winsock2.h>

#include <cstdio>
#include <cstring>
#include <iostream>
#include <string>

#include "deelx.h"

#pragma comment(lib, "WS2_32.lib")
using namespace std;
void main()
{
int err=0;
char sendBuf[200]={0};
char hostName[200]={0};
char host[200]={0} path[200]={0};
char *pHost=NULL;
printf(“请输入网址(例如:www.baidu.com)\n“);
cin>>hostName;
for (pHost = hostName; *pHost != ‘/‘ && *pHost != ‘\0‘; ++pHost);
if ( (int)(pHost - hostName) == strlen(hostName) ) //获得相对地址
strcpy(path “/“);
else
strcpy(path pHost);
*pHost = ‘\0‘;
strcpy(host hostName);//获得主机地址
/*
将要发送的HTTP报文
*/
sprintf(sendBuf“GET “);
strcat(sendBufpath);
strcat(sendBuf“ HTTP/1.1\r\n“);
strcat(sendBuf“HOST: “);
strcat(sendBufhost);
strcat(sendBuf“\r\nConnection: Close\r\n\r\n“);
/*
windows下使用socket必须用WSAStartup初始化,否则不能调用
*/
WORD w = MAKEWORD(20);;
WSADATA data;
err = WSAStartup(w &data);
if(err != 0)
{
cout<<“找不到可用的winsock.dll“< return;
}
SOCKET client=socket(AF_INET SOCK_STREAM 0);//建立socket套接字
if(client==INVALID_SOCKET)
{
cout<<“新建Socket失败!“< return ;
}
/*
将主机域名转化为IP地址
*/
struct hostent *lhost;
lhost = gethostbyname(host);
char *ip = (char *)inet_ntoa(*(struct in_addr *)(lhost->h_addr));
printf(“%s“ip); //输出ip地址
sockaddr_in addr;
addr.sin_family = AF_INET;
addr.sin_port = htons(80);
addr.sin_addr.S_un.S_addr = inet_addr(ip);
if((connect(client(struct sockaddr*)&addrsizeof(struct sockaddr))) ==SOCKET_ERROR)//建立连接
{
cout<<“连接失败,请重试!“< return;
}
send (client sendBuf strlen(sendBuf) 0);//发送HTTP消息
static CRegexpT regexp(“\“[hH][tT]{2}[pP]\\:/{2}.*?(\“)“);//正则表达式
char recvBuf[1024]={0};
char *pRcv = recvBuf;
memset(recvBuf0x00sizeof(recvBuf));
int recvresult = recv(client recvBuf 1024 0);
while(recvresult>0)
{
MatchResult Mr=regexp.Match(recvBuf);//从接收到的网页源码里提取出网站的链接
while(Mr.IsMatched())
{
char resu[200] = “ “;
for (int i=0; i resu[i] = recvBuf[Mr.GetStart()+i];
cout< Mr = regexp.Match(recvBuf Mr.GetEnd());
}
memset(recvBuf0x00sizeof(recvBuf));
recvresult = recv(client recvBuf 1024 0);
}
closesocket(client);
WSACleanup();
}
属性 大小 日期 时间 名称
----------- --------- ---------- ----- ----
文件 2456 2012-11-18 21:55 spider1.cpp
评论
共有 条评论