shell html(Linux获取网页源码的几种方法 遗世之都 ITeye技术网站)
导读:JavaEye博客还是本科做毕业设计时候开通的,基本上荒废了,现在决定记录下平时编程遇到的问题或者解决方案。...
JavaEye博客还是本科做毕业设计时开通的，之后基本上荒废了，现在决定用它记录平时编程中遇到的问题和解决方案。
// Wraps `text` in single quotes for /bin/sh, escaping embedded single quotes,
// so the shell hands it to the command as one literal argument.
static string ShellQuote(const string &text)
{
    string quoted = "'";
    for (string::size_type i = 0; i < text.length(); ++i)
    {
        if (text[i] == '\'')
        {
            quoted.append("'\\''"); // close quote, escaped quote, reopen quote
        }
        else
        {
            quoted.push_back(text[i]);
        }
    }
    quoted.push_back('\'');
    return quoted;
}

// Fetches a web page by running the external `wget` tool.
//
// The page is downloaded into a temporary file in the current directory,
// named after the part of `url` that follows its last '/', read back into
// memory, and the file is then removed.
//
// Parameters:
//   url - address of the page to download.
// Returns:
//   The page content with its lines concatenated (line terminators are
//   dropped), or "" when the URL ends in '/', wget reports failure, or the
//   downloaded file cannot be read.
// Notes:
//   - An existing file with the same name in the current directory is
//     overwritten and deleted.
string GetHtmlByWget(string url)
{
    // Everything after the last '/' names the local file; a URL without any
    // '/' is used whole.
    string::size_type slash = url.find_last_of("/");
    string fileName = (slash == string::npos) ? url : url.substr(slash + 1);
    if (fileName.empty())
    {
        return "";
    }

    // The "./" prefix keeps names such as "-" or "-rf" from being taken as
    // command-line options by wget or rm.
    string localPath = "./" + fileName;

    // -q suppresses progress output; -O pins the output file name so we read
    // back exactly what wget wrote. Both arguments are shell-quoted because
    // URLs routinely contain '&', '?', ';' and similar metacharacters.
    string strCom = "wget -q -O ";
    strCom.append(ShellQuote(localPath));
    strCom.append(" -- ");
    strCom.append(ShellQuote(url));
    int status = system(strCom.c_str());

    string strHtml;
    if (status == 0)
    {
        ifstream fin(localPath.c_str());
        string line;
        // std::getline grows `line` as needed, so long (e.g. minified) lines
        // no longer cut the page short.
        while (getline(fin, line))
        {
            strHtml.append(line);
        }
    }

    // Remove the temporary file whether or not the download succeeded;
    // -f keeps rm silent when the file does not exist.
    strCom = "rm -f -- ";
    strCom.append(ShellQuote(localPath));
    system(strCom.c_str());

    return strHtml;
}
// Wraps `text` in single quotes for /bin/sh, escaping embedded single quotes,
// so the shell hands it to the command as one literal argument.
static string ShellQuote(const string &text)
{
    string quoted = "'";
    for (string::size_type i = 0; i < text.length(); ++i)
    {
        if (text[i] == '\'')
        {
            quoted.append("'\\''"); // close quote, escaped quote, reopen quote
        }
        else
        {
            quoted.push_back(text[i]);
        }
    }
    quoted.push_back('\'');
    return quoted;
}

// Fetches a web page by running the external `wget` tool.
//
// The page is downloaded into a temporary file in the current directory,
// named after the part of `url` that follows its last '/', read back into
// memory, and the file is then removed.
//
// Parameters:
//   url - address of the page to download.
// Returns:
//   The page content with its lines concatenated (line terminators are
//   dropped), or "" when the URL ends in '/', wget reports failure, or the
//   downloaded file cannot be read.
// Notes:
//   - An existing file with the same name in the current directory is
//     overwritten and deleted.
string GetHtmlByWget(string url)
{
    // Everything after the last '/' names the local file; a URL without any
    // '/' is used whole.
    string::size_type slash = url.find_last_of("/");
    string fileName = (slash == string::npos) ? url : url.substr(slash + 1);
    if (fileName.empty())
    {
        return "";
    }

    // The "./" prefix keeps names such as "-" or "-rf" from being taken as
    // command-line options by wget or rm.
    string localPath = "./" + fileName;

    // -q suppresses progress output; -O pins the output file name so we read
    // back exactly what wget wrote. Both arguments are shell-quoted because
    // URLs routinely contain '&', '?', ';' and similar metacharacters.
    string strCom = "wget -q -O ";
    strCom.append(ShellQuote(localPath));
    strCom.append(" -- ");
    strCom.append(ShellQuote(url));
    int status = system(strCom.c_str());

    string strHtml;
    if (status == 0)
    {
        ifstream fin(localPath.c_str());
        string line;
        // std::getline grows `line` as needed, so long (e.g. minified) lines
        // no longer cut the page short.
        while (getline(fin, line))
        {
            strHtml.append(line);
        }
    }

    // Remove the temporary file whether or not the download succeeded;
    // -f keeps rm silent when the file does not exist.
    strCom = "rm -f -- ";
    strCom.append(ShellQuote(localPath));
    system(strCom.c_str());

    return strHtml;
}
//通过GET获取网页源码
stringGetHtmlByGet(stringurl)
{
stringstrHtmlContent="";
intsockfd;
structsockaddr_inaddr;
structhostent*pURL;
chartext[RECVBUF];
//分析链接
UrlInfourlInfo=ParseURL(url);
stringsAccept="Accept:*/*\r\nAccept-Language:zh-cn\r\nAccept-Encoding:gzip,deflate";
//不同的主机UserAgent不同
stringsUserAgent="Mozilla/5.0(X11;U;Linuxi686;en-US)AppleWebKit/534.10(KHTML,likeGecko)Chrome/8.0.552.224Safari/534.10";
//将端口转换为字符串
chart[6];
stringstrPort;
sprintf(t,"%d",urlInfo.Port);
strPort=t;
//构造发送字符串
stringstrRequest="";
strRequest.append("GET");
strRequest.append(urlInfo.File);
strRequest.append("?");
strRequest.append(urlInfo.Body);
strRequest.append("HTTP/1.1\r\n");
strRequest.append(sAccept);
strRequest.append("\r\nUser-Agent:");
strRequest.append(sUserAgent);
strRequest.append("\r\nHost:");
strRequest.append(urlInfo.Host);
strRequest.append(":");
strRequest.append(strPort);
strRequest.append("\r\nConnection:Keep-Alive\r\n\r\n");
char*host=const_cast<char*>(urlInfo.Host.c_str());
sockfd=socket(AF_INET,SOCK_STREAM,IPPROTO_TCP);//TCP方式发送
pURL=gethostbyname(host);
addr.sin_family=AF_INET;
addr.sin_addr.s_addr=*((unsignedlong*)pURL->h_addr);
addr.sin_port=htons(80);
//连接
connect(sockfd,(structsockaddr*)&addr,sizeof(addr));
//发送
send(sockfd,const_cast<char*>(strRequest.c_str()),strRequest.length(),0);
//接受
while(recv(sockfd,text,RECVBUF,0)>0)
{
strHtmlContent.append(text);
bzero(text,RECVBUF);
}
//关闭socket
close(sockfd);
//返回接受结果
returnstrHtmlContent;
}
// Fetches a URL by sending a hand-built HTTP GET request over a TCP socket.
//
// Parameters:
//   url - address to fetch; it is split into host, port, path and query by
//         ParseURL.
// Returns:
//   Every byte the server sent before closing the connection (status line
//   and response headers included), or whatever was received so far ("" if
//   nothing) when name resolution, socket creation, connecting or sending
//   fails.
// Notes:
//   - The request asks for "Connection: close" and an identity encoding:
//     this function reads until end-of-stream and does not decompress, so a
//     kept-alive connection would stall the read loop and a gzip body would
//     come back as binary data.
//   - NOTE(review): the body is not de-chunked; an HTTP/1.1 server may still
//     answer with "Transfer-Encoding: chunked".
string GetHtmlByGet(string url)
{
    string strHtmlContent = "";

    // Split the link into its components.
    UrlInfo urlInfo = ParseURL(url);

    // Use the parsed port when it is a valid TCP port, otherwise fall back
    // to the HTTP default of 80.
    int port = urlInfo.Port;
    if (port <= 0 || port > 65535)
    {
        port = 80;
    }
    char portText[16];
    snprintf(portText, sizeof(portText), "%d", port);

    // NOTE(review): assumes File holds the absolute request path; an empty
    // path is sent as "/" so the request line stays well-formed.
    string path(urlInfo.File);
    if (path.empty())
    {
        path = "/";
    }
    string query(urlInfo.Body);

    string sAccept = "Accept: */*\r\nAccept-Language: zh-cn\r\nAccept-Encoding: identity";
    // The User-Agent string differs from host to host.
    string sUserAgent = "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10";

    // Build the request text.
    string strRequest = "GET ";
    strRequest.append(path);
    if (!query.empty())
    {
        // Only add the '?' separator when there is a query to send.
        strRequest.append("?");
        strRequest.append(query);
    }
    strRequest.append(" HTTP/1.1\r\n");
    strRequest.append(sAccept);
    strRequest.append("\r\nUser-Agent: ");
    strRequest.append(sUserAgent);
    strRequest.append("\r\nHost: ");
    strRequest.append(urlInfo.Host);
    strRequest.append(":");
    strRequest.append(portText);
    strRequest.append("\r\nConnection: close\r\n\r\n");

    // Resolve the host; give up when there is no IPv4 address to connect to.
    struct hostent *pURL = gethostbyname(urlInfo.Host.c_str());
    if (pURL == NULL || pURL->h_addrtype != AF_INET || pURL->h_addr_list[0] == NULL)
    {
        return strHtmlContent;
    }

    struct sockaddr_in addr;
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons((unsigned short)port);
    // Copy exactly the 4 address bytes; reading them through an
    // `unsigned long *` would fetch 8 bytes on LP64 platforms.
    memcpy(&addr.sin_addr, pURL->h_addr_list[0], sizeof(addr.sin_addr));

    int sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); // TCP transport
    if (sockfd < 0)
    {
        return strHtmlContent;
    }

    // Connect.
    if (connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) != 0)
    {
        close(sockfd);
        return strHtmlContent;
    }

    // Send; send() may accept only part of the buffer, so loop until done.
    const char *cursor = strRequest.data();
    size_t remaining = strRequest.length();
    while (remaining > 0)
    {
        ssize_t sent = send(sockfd, cursor, remaining, 0);
        if (sent <= 0)
        {
            close(sockfd);
            return strHtmlContent;
        }
        cursor += sent;
        remaining -= (size_t)sent;
    }

    // Receive until the peer closes the connection. Append by byte count:
    // the buffer is not NUL-terminated and the data may contain NUL bytes.
    char text[RECVBUF];
    ssize_t received;
    while ((received = recv(sockfd, text, sizeof(text), 0)) > 0)
    {
        strHtmlContent.append(text, (size_t)received);
    }

    // Close the socket.
    close(sockfd);

    // Return what was received.
    return strHtmlContent;
}
#include<stdio.h>
#include<string.h>
#include<curl/curl.h>
/* Capacity of the download buffer in bytes, not counting the terminator. */
#define MAX_BUF 65536

/* Accumulated response data; always NUL-terminated at wr_buf[wr_index]. */
char wr_buf[MAX_BUF + 1];
/* Number of bytes currently stored in wr_buf. */
int wr_index;

/*
 * Write-data callback (called within the context of curl_easy_perform).
 *
 * Appends the `size * nmemb` bytes at `buffer` to wr_buf and keeps the
 * buffer NUL-terminated.
 *
 * buffer - data delivered by curl (not NUL-terminated).
 * size   - size of one element in bytes.
 * nmemb  - number of elements.
 * userp  - context pointer; when non-NULL it must point to an int, which is
 *          set to 1 if the data does not fit.
 *
 * Returns the number of bytes stored, which tells curl all is okay, or 0
 * when the segment would exceed MAX_BUF, which indicates a problem to curl.
 */
size_t write_data(void *buffer, size_t size, size_t nmemb, void *userp)
{
    int fits = (wr_index >= 0 && wr_index <= MAX_BUF);

    if (fits && size != 0) {
        /* Compare by division so that size * nmemb can never wrap around
         * and slip past the bounds check. */
        size_t room = (size_t)MAX_BUF - (size_t)wr_index;
        fits = (nmemb <= room / size);
    }

    /* Check to see if this data exceeds the size of our buffer. If so,
     * set the user-defined context value and return 0 to indicate a
     * problem to curl.
     */
    if (!fits) {
        if (userp != NULL) {
            *(int *)userp = 1;
        }
        return 0;
    }

    size_t segsize = size * nmemb;

    /* Copy the data from the curl buffer into our buffer */
    memcpy(&wr_buf[wr_index], buffer, segsize);

    /* Update the write index */
    wr_index += (int)segsize;

    /* Null terminate the buffer */
    wr_buf[wr_index] = 0;

    /* Return the number of bytes received, indicating to curl that all is okay */
    return segsize;
}
/*
*Simplecurlapplicationtoreadtheindex.htmlfilefromaWebsite.
*/
intmain(void)
{
CURL*curl;
CURLcoderet;
intwr_error;
wr_error=0;
wr_index=0;
/*Firststep,initcurl*/
curl=curl_easy_init();
if(!curl){
printf("couldntinitcurl\n");
return0;
}
/*TellcurltheURLofthefileweregoingtoretrieve*/
curl_easy_setopt(curl,CURLOPT_URL,"www.exampledomain.com");
/*Tellcurlthatwellreceivedatatothefunctionwrite_data,and
*alsoprovideitwithacontextpointerforourerrorreturn.
*/
curl_easy_setopt(curl,CURLOPT_WRITEDATA,(void*)&wr_error);
curl_easy_setopt(curl,CURLOPT_WRITEFUNCTION,write_data);
/*Allowcurltoperformtheaction*/
ret=curl_easy_perform(curl);
printf("ret=%d(write_error=%d)\n",ret,wr_error);
/*Emitthepageifcurlindicatesthatnoerrorsoccurred*/
if(ret==0)printf("%s\n",wr_buf);
curl_easy_cleanup(curl);
return0;
}
第一种方法是利用Linux下的工具来获取网页源码。我用的是Wget，也可以使用Curl；Curl更加灵活，可以设置很多参数。
C++代码
第二个是用的socket的来获取源码
C++代码
使用libcurl
C代码
声明：本站所有文章，如无特殊说明或标注，均为本站原创发布。任何个人或组织，在未征得本站同意时，禁止复制、盗用、采集、发布本站内容到任何网站、书籍等各类媒体平台。如若本站内容侵犯了原著者的合法权益，可联系我们进行处理。
创心域SEO版权声明:以上内容作者已申请原创保护,未经允许不得转载,侵权必究!授权事宜、对本内容有异议或投诉,敬请联系网站管理员,我们将尽快回复您,谢谢合作!