【GDOJ-VJUDGE】C/C++抓取OJ题目信息 —— Spider.cpp
ACSolo  发布于 11 年前 2.0k 0 0

首先使用上一篇文章介绍的libcurl库,通过post请求获取OJ题目的HTML源码,再使用pcre库的正则表达式进行匹配。注意:pcre需要先编译出lib和dll哦。

 

下面是作者写的抓取HDOJ题目信息的源代码,是不是很简单呢?


// Spider.cpp : Defines the entry point for the console application.

//
/*
C/C++抓取OJ题目信息
Author: Jungle Wei
Create Date: 2013-01-16
*/

#define PCRE_STATIC // build option for using PCRE as a static library

#include <io.h>

#include "curl\curl.h"

#include <iostream>
#include <cstdio>
#include <string>
#include <fstream>
#include <stdio.h>
#include "pcre.h"

using namespace std;

// Ask the linker to pull in the PCRE, libcurl, OpenLDAP and SSL import/static
// libraries directly from the source file.
#pragma comment(lib, "pcre.lib") 
#pragma comment(lib, "curllib.lib") 
#pragma comment(lib, "openldap.lib") 
#pragma comment(lib, "ssleay32.lib") 

// Size in bytes of the shared line buffer `tmps`; also the limit handed to
// fgets() in getProblemInfo_Brief().
#define MAX_SIZE_BUF 1000000

// File-scope curl handle and result code. The functions in this listing do
// not reference them: getProblemInfo() declares its own locals with the
// same names, which shadow these.
CURL *curl;
CURLcode res;
// Scratch buffer that getProblemInfo_Brief() reads the downloaded page into,
// one fgets() call at a time.
char tmps[MAX_SIZE_BUF];

// Placeholder account settings; not referenced by the code in this listing.
char username[1000]="username";
char password[1000]="password";
// Name of the temporary file the fetched page is written to and parsed from.
char tfilename[1000]="tmpfile.txt";
// Judge host name; not referenced by the code in this listing (the URL in
// getProblemInfo() is spelled out literally).
char domain[255]="acm.hdu.edu.cn";  //acm.guet.edu.cn
// Extra HTTP header list; not referenced by the code in this listing.
struct curl_slist *headerlist=NULL;

// Expands to its argument unchanged; not used in this listing.
#define DEBUG_PRINT(X)   X

// Shorthand type names used in the function signatures below.
#define UCHAR unsigned char
#define ULONG unsigned long
#define CHAR char

// Status codes returned by the functions below.
// NOTE: these are inverted relative to C truthiness -- BOOL_TRUE is 0 and
// BOOL_FALSE is 1 -- so always compare against the macro, never test the
// return value directly in an `if`.
#define BOOL_TRUE 0
#define BOOL_FALSE 1

// Number of ints in the offset vector passed to pcre_exec().
#define OVECCOUNT 30    /* should be a multiple of 3 */

/*
 * libcurl write callback: appends one received chunk to the FILE* that was
 * registered through CURLOPT_WRITEDATA.
 *
 *   buffer  chunk of response data (not NUL-terminated)
 *   size    size in bytes of one element
 *   nmemb   number of elements in the chunk
 *   user_p  the destination FILE*
 *
 * Returns the number of BYTES written. libcurl compares the return value
 * against size * nmemb and aborts the transfer on a mismatch; fwrite()
 * reports whole elements, so its result is scaled by `size`. A NULL stream
 * yields 0 instead of crashing inside fwrite().
 */
size_t process_data(void *buffer, size_t size, size_t nmemb, void *user_p)
{
    FILE *fp = (FILE *)user_p;

    if (fp == NULL) {
        return 0;
    }

    return fwrite(buffer, size, nmemb, fp) * size;
}


ULONG getInfoByTag(char *src, char *pattern, char *res)
{
    pcre  *re;    
    const char *error;    
    int  erroffset;    
    int  ovector[OVECCOUNT];    
    int  rc, i;    
   
//title
re = pcre_compile(pattern, 0, &error, &erroffset, NULL);         
    if (re == NULL) {                 //如果编译失败,返回错误信息    
        printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);    
        return BOOL_FALSE;    
    }    

    rc = pcre_exec(re,NULL, src, strlen(src), 0, 0, ovector, OVECCOUNT);    
// 返回值:匹配成功返回非负数,没有匹配返回负数    
    if (rc < 0) {                     //如果没有匹配,返回错误信息    
        if (rc == PCRE_ERROR_NOMATCH) printf("Sorry, no match ...\n");    
        else printf("Matching error %d\n", rc);    
        pcre_free(re);    
        return BOOL_FALSE;    
    }

i = (rc==0)?(0):(rc-1);

// for (i = 0; i < rc; i++) //分别取出捕获分组 $0整个正则公式 $1第一个()  
{               
        char *substring_start =  src + ovector[2*i];    
        int substring_length = ovector[2*i+1] - ovector[2*i];    
        
printf("%.*s\n", substring_length, substring_start);
    }   

pcre_free(re);                     // 编译正则表达式re 释放内存   
return BOOL_TRUE;
}

int getProblemInfo_Brief(char *filename)    
{    
string res="",ts;
    FILE * fp=fopen(filename,"r");
    while (fgets(tmps, MAX_SIZE_BUF, fp))
    {
        ts +=tmps;
    }
    fclose(fp);

char  patternTime [] = "(\\d*) MS";  // 将要被编译的字符串形式的正则表达式    

char  patternMemory [] = "(\\d*) K";  // 将要被编译的字符串形式的正则表达式    

char  patternTitle [] = "<h1 style='color:#1A5CC8'>([\\s\\S]*?)</h1>";  // 将要被编译的字符串形式的正则表达式    
    
char  patternDescription [] = "Problem Description</div> <div class=panel_content>([\\s\\S]*?)</div><div class=panel_bottom>&nbsp;</div>";  // 将要被编译的字符串形式的正则表达式    
    
char  patternInput [] = "Input</div> <div class=panel_content>([\\s\\S]*?)</div><div class=panel_bottom>&nbsp;</div>";  // 将要被编译的字符串形式的正则表达式    
    
char  patternOutput [] = "Output</div> <div class=panel_content>([\\s\\S]*?)</div><div class=panel_bottom>&nbsp;</div>";  // 将要被编译的字符串形式的正则表达式    
    
char  patternSampleInput [] = "Sample Input</div><div class=panel_content><pre><div style=\"font-family:Courier New,Courier,monospace;\">([\\s\\S]*?)</div></pre></div><div class=panel_bottom>&nbsp;</div>";  // 将要被编译的字符串形式的正则表达式    
char  patternSampleOutput [] = "Sample Output</div><div class=panel_content><pre><div style=\"font-family:Courier New,Courier,monospace;\">([\\s\\S]*?)</div></pre></div><div class=panel_bottom>&nbsp;</div>";  // 将要被编译的字符串形式的正则表达式    
    
char  patternAuthor [] = "Author</div> <div class=panel_content>([\\s\\S]*?)</div><div class=panel_bottom>&nbsp;</div>";  // 将要被编译的字符串形式的正则表达式    
    
//char  patternTitle [] = "<h1 style='color:#1A5CC8'>([\\s\\S]*?)</h1>";  // 将要被编译的字符串形式的正则表达式    
    
puts("Time");
getInfoByTag((char*)ts.c_str(), patternTime, NULL);

puts("Memoty");
getInfoByTag((char*)ts.c_str(), patternMemory, NULL);
puts("Title");
getInfoByTag((char*)ts.c_str(), patternTitle, NULL);

puts("Description");
getInfoByTag((char*)ts.c_str(), patternDescription, NULL);

puts("Input");
getInfoByTag((char*)ts.c_str(), patternInput, NULL);

puts("Output");
getInfoByTag((char*)ts.c_str(), patternOutput, NULL);

puts("Sample Input");
getInfoByTag((char*)ts.c_str(), patternSampleInput, NULL);

puts("Sample Output");
getInfoByTag((char*)ts.c_str(), patternSampleOutput, NULL);

puts("Author");
getInfoByTag((char*)ts.c_str(), patternAuthor, NULL);

    return 0;    
}    


/*
 * Downloads the HDOJ page for problem `pid` into the scratch file
 * `tfilename` and then prints its fields via getProblemInfo_Brief().
 *
 *   pid  problem id, appended verbatim to the showproblem.php URL
 *
 * Returns BOOL_TRUE when the page was fetched and parsed, BOOL_FALSE when
 * the curl handle or the scratch file could not be created, the transfer
 * failed, or the parser reported an error.
 *
 * Differences from the earlier version:
 *  - the URL is built in a std::string, so a long `pid` can no longer
 *    overflow a fixed 255-byte buffer;
 *  - the scratch file is opened with "wb" (truncate) rather than "ab+",
 *    so a second call does not parse leftovers of the previous page;
 *  - fopen() and curl_easy_perform() results are checked, and failures are
 *    reported to the caller instead of always returning BOOL_TRUE.
 */
ULONG getProblemInfo(string pid)
{
    CURL *curl = curl_easy_init();
    if (curl == NULL) {
        fprintf(stderr, "getProblemInfo: curl_easy_init failed\n");
        return BOOL_FALSE;
    }

    FILE *fp = fopen(tfilename, "wb");
    if (fp == NULL) {
        fprintf(stderr, "getProblemInfo: cannot open %s\n", tfilename);
        curl_easy_cleanup(curl);
        return BOOL_FALSE;
    }

    /* `url` stays alive until after curl_easy_perform() returns. */
    string url = "http://acm.hdu.edu.cn/showproblem.php?pid=" + pid;

    curl_easy_setopt(curl, CURLOPT_VERBOSE, 0L);
    curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "hdu.cookie");
    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &process_data);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);

    CURLcode res = curl_easy_perform(curl);

    curl_easy_cleanup(curl);
    fclose(fp);

    if (res != CURLE_OK) {
        fprintf(stderr, "getProblemInfo: transfer failed: %s\n", curl_easy_strerror(res));
        return BOOL_FALSE;
    }

    if (getProblemInfo_Brief(tfilename) != 0) {
        return BOOL_FALSE;
    }

    return BOOL_TRUE;
}

int main(int argc, char *argv[])
{
if (access(tfilename, 0) == 0)
{
DeleteFile(tfilename);
}
    curl_global_init(CURL_GLOBAL_ALL);

if (BOOL_TRUE != getProblemInfo("1000"))
{
puts("getProblemInfo failed.");
}
    return 0;
}

运行效果:
Time
1000
Memoty
32768
Title
A + B Problem
Description
Calculate <i>A + B</i>.<br>
Input
Each line will contain two integers <i>A</i> and <i>B</i>. Process to end of file.<br>
Output
For each case, output <i>A + B</i> in one line.<br>
Sample Input
1 1
Sample Output
2
Author
HDOJ
Press any key to continue