首先使用上一篇文章介绍的libcurl库发起HTTP请求,获取OJ题目的HTML源码,再使用pcre库进行正则表达式匹配。注意pcre需要先编译出lib和dll哦。
下面是作者写的抓取HDOJ题目信息的源代码,是不是很简单呢?
// Spider.cpp : Defines the entry point for the console application.
//
/*
 * Scrape OJ problem information with C/C++
 * Author: Jungle Wei
 * Create Date: 2013-01-16
 */
#define PCRE_STATIC // build option: link against the static PCRE library
#include <io.h>
#include "curl\curl.h"
#include <iostream>
#include <cstdio>
#include <cstring>
#include <string>
#include <fstream>
#include <stdio.h>
#include "pcre.h"
using namespace std;
#pragma comment(lib, "pcre.lib")
#pragma comment(lib, "curllib.lib")
#pragma comment(lib, "openldap.lib")
#pragma comment(lib, "ssleay32.lib")
// Capacity, in bytes, of the shared line buffer `tmps` below.
#define MAX_SIZE_BUF 1000000

// File-scope curl handle and result code. getProblemInfo declares locals with
// the same names, which shadow these inside that function.
CURL *curl;
CURLcode res;

// Scratch buffer that getProblemInfo_Brief fills with fgets, one line at a time.
char tmps[MAX_SIZE_BUF];

char username[1000] = "username";
char password[1000] = "password";
// Temporary file the downloaded page is written to and then parsed from.
char tfilename[1000] = "tmpfile.txt";
char domain[255] = "acm.hdu.edu.cn"; // alternative: acm.guet.edu.cn
struct curl_slist *headerlist = NULL;

// Expands to its argument unchanged.
#define DEBUG_PRINT(X) X

#define UCHAR unsigned char
#define ULONG unsigned long
#define CHAR char

// Status codes returned by the ULONG functions in this file.
// NOTE: success is 0 and failure is 1, the reverse of a C++ bool.
#define BOOL_TRUE 0
#define BOOL_FALSE 1

// Number of ints in the PCRE output vector.
#define OVECCOUNT 30 /* should be a multiple of 3 */
/*
 * Write callback in the shape libcurl expects for CURLOPT_WRITEFUNCTION:
 * writes the received chunk to the FILE* passed through `user_p`.
 *
 * buffer  start of the received data.
 * size    size of one element.
 * nmemb   number of elements in `buffer`.
 * user_p  the FILE* to write to (registered by the caller as the write target).
 *
 * Returns the value reported by fwrite (number of elements written), or 0
 * when no stream was supplied.
 */
size_t process_data(void *buffer, size_t size, size_t nmemb, void *user_p)
{
    FILE *fp = static_cast<FILE *>(user_p);
    if (fp == NULL) {
        // No destination stream: report that nothing was consumed instead of
        // handing a null FILE* to fwrite.
        return 0;
    }

    const size_t return_size = fwrite(buffer, size, nmemb, fp);
    return return_size;
}
/*
 * Compiles `pattern` with PCRE, runs it once against `src` and prints one
 * matched substring followed by a newline: the last capture group that PCRE
 * reports as set, or the whole match when pcre_exec returns 0.
 *
 * src      NUL-terminated subject text to search.
 * pattern  NUL-terminated PCRE regular expression.
 * res      not used by this function; callers in this file pass NULL.
 *
 * Returns BOOL_TRUE (0) when the pattern matched, BOOL_FALSE (1) when an
 * argument is NULL, the pattern fails to compile, or the match fails.
 */
ULONG getInfoByTag(char *src, char *pattern, char *res)
{
    (void)res;

    if (src == NULL || pattern == NULL) {
        return BOOL_FALSE;
    }

    const char *error = NULL;
    int erroffset = 0;
    pcre *re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
    if (re == NULL) {
        // Compilation failed: report where and why.
        printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
        return BOOL_FALSE;
    }

    int ovector[OVECCOUNT];
    const int rc = pcre_exec(re, NULL, src, static_cast<int>(strlen(src)),
                             0, 0, ovector, OVECCOUNT);
    if (rc < 0) {
        // Negative return: either no match or a matching error.
        if (rc == PCRE_ERROR_NOMATCH) {
            printf("Sorry, no match ...\n");
        } else {
            printf("Matching error %d\n", rc);
        }
        pcre_free(re);
        return BOOL_FALSE;
    }

    // Only one group is printed: index rc-1, or index 0 (the whole match)
    // when rc is 0. ovector holds [start, end) offset pairs into src.
    const int i = (rc == 0) ? 0 : (rc - 1);
    const char *substring_start = src + ovector[2 * i];
    const int substring_length = ovector[2 * i + 1] - ovector[2 * i];
    printf("%.*s\n", substring_length, substring_start);

    pcre_free(re); // release the compiled pattern
    return BOOL_TRUE;
}
/*
 * Reads the whole text file `filename` into memory and, for each problem
 * field (time limit, memory limit, title, description, input, output,
 * sample input, sample output, author), prints a label line and then lets
 * getInfoByTag print the text captured by that field's regular expression.
 *
 * filename  path of the saved problem page.
 *
 * Returns 0 after processing all fields, or -1 when the file cannot be
 * opened.
 */
int getProblemInfo_Brief(char *filename)
{
    if (filename == NULL) {
        return -1;
    }

    FILE *fp = fopen(filename, "r");
    if (fp == NULL) {
        printf("Cannot open %s\n", filename);
        return -1;
    }

    // Accumulate the file line by line through the shared buffer `tmps`.
    string ts;
    while (fgets(tmps, MAX_SIZE_BUF, fp)) {
        ts += tmps;
    }
    fclose(fp);

    // Regular expressions, one per field; group 1 captures the field's text.
    char patternTime [] = "(\\d*) MS";
    char patternMemory [] = "(\\d*) K";
    char patternTitle [] = "<h1 style='color:#1A5CC8'>([\\s\\S]*?)</h1>";
    char patternDescription [] = "Problem Description</div> <div class=panel_content>([\\s\\S]*?)</div><div class=panel_bottom> </div>";
    char patternInput [] = "Input</div> <div class=panel_content>([\\s\\S]*?)</div><div class=panel_bottom> </div>";
    char patternOutput [] = "Output</div> <div class=panel_content>([\\s\\S]*?)</div><div class=panel_bottom> </div>";
    char patternSampleInput [] = "Sample Input</div><div class=panel_content><pre><div style=\"font-family:Courier New,Courier,monospace;\">([\\s\\S]*?)</div></pre></div><div class=panel_bottom> </div>";
    char patternSampleOutput [] = "Sample Output</div><div class=panel_content><pre><div style=\"font-family:Courier New,Courier,monospace;\">([\\s\\S]*?)</div></pre></div><div class=panel_bottom> </div>";
    char patternAuthor [] = "Author</div> <div class=panel_content>([\\s\\S]*?)</div><div class=panel_bottom> </div>";

    struct Field {
        const char *label;
        char *pattern;
    };
    // Labels are emitted verbatim; "Memoty" (sic) is kept because it is the
    // program's existing output.
    const Field fields[] = {
        { "Time",          patternTime },
        { "Memoty",        patternMemory },
        { "Title",         patternTitle },
        { "Description",   patternDescription },
        { "Input",         patternInput },
        { "Output",        patternOutput },
        { "Sample Input",  patternSampleInput },
        { "Sample Output", patternSampleOutput },
        { "Author",        patternAuthor },
    };

    // getInfoByTag takes a non-const char*; ts is not modified while this
    // pointer is in use.
    char *page = const_cast<char *>(ts.c_str());
    for (size_t k = 0; k < sizeof(fields) / sizeof(fields[0]); ++k) {
        puts(fields[k].label);
        getInfoByTag(page, fields[k].pattern, NULL);
    }

    return 0;
}
/*
 * Downloads the HDOJ problem page for `pid` with libcurl, appending the
 * response body to the temporary file `tfilename`, then calls
 * getProblemInfo_Brief on that file to print the extracted fields.
 *
 * pid  problem id, appended to the showproblem.php URL as-is.
 *
 * Returns BOOL_TRUE (0) on success; BOOL_FALSE (1) when the curl handle or
 * the temporary file cannot be created, the transfer fails, or the saved
 * page cannot be processed.
 */
ULONG getProblemInfo(string pid)
{
    CURL *curl = curl_easy_init();
    if (curl == NULL) {
        return BOOL_FALSE;
    }

    FILE *fp = fopen(tfilename, "ab+");
    if (fp == NULL) {
        printf("Cannot open %s\n", tfilename);
        curl_easy_cleanup(curl);
        return BOOL_FALSE;
    }

    // Build the URL in a std::string so a long pid cannot overflow a fixed
    // buffer; `url` stays alive until the transfer has finished.
    const string url = "http://acm.hdu.edu.cn/showproblem.php?pid=" + pid;

    curl_easy_setopt(curl, CURLOPT_VERBOSE, 0L);
    curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "hdu.cookie");
    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &process_data);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);

    const CURLcode res = curl_easy_perform(curl);

    curl_easy_cleanup(curl);
    fclose(fp);

    if (res != CURLE_OK) {
        printf("curl_easy_perform failed: %s\n", curl_easy_strerror(res));
        return BOOL_FALSE;
    }

    if (getProblemInfo_Brief(tfilename) != 0) {
        return BOOL_FALSE;
    }
    return BOOL_TRUE;
}
int main(int argc, char *argv[]){if (access(tfilename, 0) == 0){DeleteFile(tfilename);}curl_global_init(CURL_GLOBAL_ALL);
if (BOOL_TRUE != getProblemInfo("1000")){puts("getProblemInfo failed.");}return 0;}
运行效果:

Time
1000
Memoty
32768
Title
A + B Problem
Description
Calculate <i>A + B</i>.<br>
Input
Each line will contain two integers <i>A</i> and <i>B</i>. Process to end of file.<br>
Output
For each case, output <i>A + B</i> in one line.<br>
Sample Input
1 1
Sample Output
2
Author
HDOJ
Press any key to continue