/* docs/examples/crawler.c - platform/external/curl - Git at Google */

 /***************************************************************************
  *                                  _   _ ____  _
  *  Project                     ___| | | |  _ \| |
  *                             / __| | | | |_) | |
  *                            | (__| |_| |  _ <| |___
  *                             \___|\___/|_| \_\_____|
  *
  * Web crawler based on curl and libxml2.
  * Copyright (C) 2018 Jeroen Ooms <jeroenooms@gmail.com>
  * License: MIT
  *
  * To compile:
  *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
  *
  */
 /* <DESC>
  * Web crawler based on curl and libxml2 to stress-test curl with
  * hundreds of concurrent connections to various servers.
  * </DESC>
  */

 /* Parameters */
 /* value given to CURLMOPT_MAX_TOTAL_CONNECTIONS on the multi handle */
 int max_con = 200;
 /* pages are only scanned for links while (complete + pending) is below
    this value */
 int max_total = 20000;
 /* pages are only scanned for links while the pending counter is below
    this value */
 int max_requests = 500;
 /* threshold at which follow_links() stops queuing links from one page */
 int max_link_per_page = 5;
 /* when non-zero, follow_links() resolves each href against the page URL
    with xmlBuildURI() before checking it */
 int follow_relative_links = 0;
 /* URL of the first transfer added to the multi handle */
 char *start_page = "https://www.reuters.com";

 #include <libxml/HTMLparser.h>
 #include <libxml/xpath.h>
 #include <libxml/uri.h>
 #include <curl/curl.h>
 #include <limits.h>
 #include <math.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 /* Set to 1 by sighandler(); the transfer loop in main() polls it to stop.
    It is a volatile sig_atomic_t because that is the object type the C
    standard guarantees a signal handler may assign to. */
 volatile sig_atomic_t pending_interrupt = 0;

 /*
  * Signal handler (registered for SIGINT in main()): records that an
  * interrupt was requested.
  *
  * dummy is the signal number; it is not used.
  */
 void sighandler(int dummy)
 {
   (void)dummy;
   pending_interrupt = 1;
 }

 /* resizable buffer */
 typedef struct {
   char *buf;    /* heap block holding the bytes stored so far */
   size_t size;  /* number of valid bytes in buf */
 } memory;

 /*
  * Write callback (installed through CURLOPT_WRITEFUNCTION in
  * make_handle()): appends the sz * nmemb bytes at contents to the memory
  * buffer passed as ctx.
  *
  * Returns the number of bytes stored, which is sz * nmemb on success.
  * Returns 0 without touching the buffer when there is nothing to store,
  * when the new size would not fit in a size_t, or when realloc() fails.
  */
 size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
 {
   memory *mem = ctx;
   size_t realsize;
   char *ptr;

   /* An empty chunk must not reach realloc(): a zero-size request may free
      the block and return NULL, which would leave mem->buf dangling and be
      misreported as an allocation failure. */
   if(sz == 0 || nmemb == 0)
     return 0;

   /* refuse sizes that do not fit in a size_t */
   if(sz > (size_t)-1 / nmemb)
     return 0;
   realsize = sz * nmemb;
   if(realsize > (size_t)-1 - mem->size)
     return 0;

   ptr = realloc(mem->buf, mem->size + realsize);
   if(!ptr) {
     /* out of memory; mem->buf is still valid and owned by the caller */
     printf("not enough memory (realloc returned NULL)\n");
     return 0;
   }
   mem->buf = ptr;
   memcpy(mem->buf + mem->size, contents, realsize);
   mem->size += realsize;
   return realsize;
 }

 /*
  * Creates an easy handle set up to fetch url into a freshly allocated
  * memory buffer.
  *
  * The buffer is attached both as CURLOPT_WRITEDATA (filled by
  * grow_buffer()) and as CURLOPT_PRIVATE, so the code that finishes with the
  * handle can get it back through CURLINFO_PRIVATE; that code must free
  * buf and the struct itself.
  *
  * Returns the new handle, or NULL when the handle or its buffer could not
  * be allocated; nothing is leaked in that case.
  */
 CURL *make_handle(char *url)
 {
   CURL *handle;
   memory *mem = malloc(sizeof *mem);

   if(!mem)
     return NULL;
   mem->size = 0;
   mem->buf = malloc(1);
   if(!mem->buf) {
     free(mem);
     return NULL;
   }

   handle = curl_easy_init();
   if(!handle) {
     free(mem->buf);
     free(mem);
     return NULL;
   }

   /* Important: use HTTP2 over HTTPS */
   curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
   curl_easy_setopt(handle, CURLOPT_URL, url);

   /* buffer body */
   curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
   curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
   curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);

   /* For completeness */
   curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
   curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
   curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
   curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
   curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
   curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
   curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
   curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
   curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
   curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
   curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
   curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
   return handle;
 }

 /* HREF finder implemented in libxml2 but could be any HTML parser */
 /*
  * Parses the HTML held in mem (the body fetched from url), samples the
  * href attributes of its <a> elements at random and queues a new transfer
  * on multi_handle for every sampled link that is at least 20 characters
  * long and starts with "http://" or "https://". At most
  * max_link_per_page transfers are queued.
  *
  * Returns the number of transfers actually added to multi_handle, so the
  * caller can add it to its count of pending transfers.
  */
 size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
 {
   int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR |
              HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
   size_t limit = max_link_per_page > 0 ? (size_t)max_link_per_page : 0;
   size_t count = 0;
   htmlDocPtr doc;
   xmlXPathContextPtr context;
   xmlXPathObjectPtr result = NULL;
   xmlNodeSetPtr nodeset;

   /* htmlReadMemory() takes the length as an int */
   if(mem->size > (size_t)INT_MAX)
     return 0;
   doc = htmlReadMemory(mem->buf, (int)mem->size, url, NULL, opts);
   if(!doc)
     return 0;

   context = xmlXPathNewContext(doc);
   if(context) {
     result = xmlXPathEvalExpression((const xmlChar *)"//a/@href", context);
     xmlXPathFreeContext(context);
   }
   nodeset = result ? result->nodesetval : NULL;

   if(!xmlXPathNodeSetIsEmpty(nodeset)) {
     for(int i = 0; i < nodeset->nodeNr && count < limit; i++) {
       /* random index in [0, nodeNr): dividing by RAND_MAX + 1.0 keeps
          rand() == RAND_MAX from yielding the out-of-range index nodeNr */
       int x = (int)(rand() / (RAND_MAX + 1.0) * nodeset->nodeNr);
       const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
       xmlChar *href = xmlNodeListGetString(doc, node, 1);
       char *link;

       if(follow_relative_links && href) {
         xmlChar *orig = href;
         href = xmlBuildURI(href, (xmlChar *) url);
         xmlFree(orig);
       }
       link = (char *) href;
       if(link && strlen(link) >= 20 &&
          (!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8))) {
         CURL *handle = make_handle(link);
         if(handle) {
           if(curl_multi_add_handle(multi_handle, handle) == CURLM_OK)
             count++;
           else {
             /* never queued, so no completion message will arrive for it:
                release the handle and its private buffer here */
             memory *unused = NULL;
             curl_easy_getinfo(handle, CURLINFO_PRIVATE, &unused);
             curl_easy_cleanup(handle);
             if(unused) {
               free(unused->buf);
               free(unused);
             }
           }
         }
       }
       /* the string is released on every path, used or not */
       if(href)
         xmlFree(href);
     }
   }

   if(result)
     xmlXPathFreeObject(result);
   xmlFreeDoc(doc);
   return count;
 }

 /*
  * Tells whether a Content-Type value denotes an HTML document.
  *
  * ctype may be NULL. Returns 1 when ctype contains the substring
  * "text/html", so a bare "text/html" matches as well as a value with
  * parameters such as "text/html; charset=utf-8"; returns 0 otherwise.
  */
 int is_html(char *ctype)
 {
   return ctype != NULL && strstr(ctype, "text/html") != NULL;
 }

 int main(void)
 {
   signal(SIGINT, sighandler);
   LIBXML_TEST_VERSION;
   curl_global_init(CURL_GLOBAL_DEFAULT);
   CURLM *multi_handle = curl_multi_init();
   curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
   curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);

   /* enables http/2 if available */
 #ifdef CURLPIPE_MULTIPLEX
   curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
 #endif

   /* sets html start page */
   curl_multi_add_handle(multi_handle, make_handle(start_page));

   int msgs_left;
   int pending = 0;
   int complete = 0;
   int still_running = 1;
   while(still_running && !pending_interrupt) {
     int numfds;
     curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
     curl_multi_perform(multi_handle, &still_running);

     /* See how the transfers went */
     CURLMsg *m = NULL;
     while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
       if(m->msg == CURLMSG_DONE) {
         CURL *handle = m->easy_handle;
         char *url;
         memory *mem;
         curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
         curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
         if(m->data.result == CURLE_OK) {
           long res_status;
           curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
           if(res_status == 200) {
             char *ctype;
             curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
             printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
             if(is_html(ctype) && mem->size > 100) {
               if(pending < max_requests && (complete + pending) < max_total) {
                 pending += follow_links(multi_handle, mem, url);
                 still_running = 1;
               }
             }
           }
           else {
             printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
           }
         }
         else {
           printf("[%d] Connection failure: %s\n", complete, url);
         }
         curl_multi_remove_handle(multi_handle, handle);
         curl_easy_cleanup(handle);
         free(mem->buf);
         free(mem);
         complete++;
         pending--;
       }
     }
   }
   curl_multi_cleanup(multi_handle);
   curl_global_cleanup();
   return 0;
 }
	/***************************************************************************
	*                                  _   _ ____  _
	*  Project                     ___| | | |  _ \| |
	*                             / __| | | | |_) | |
	*                            | (__| |_| |  _ <| |___
	*                             \___|\___/|_| \_\_____|
	*
	* Web crawler based on curl and libxml2.
	* Copyright (C) 2018 Jeroen Ooms <jeroenooms@gmail.com>
	* License: MIT
	*
	* To compile:
	* gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
	*
	*/
	/* <DESC>
	* Web crawler based on curl and libxml2 to stress-test curl with
	* hundreds of concurrent connections to various servers.
	* </DESC>
	*/

	/* Parameters */
	/* value given to CURLMOPT_MAX_TOTAL_CONNECTIONS on the multi handle */
	int max_con = 200;
	/* pages are only scanned for links while (complete + pending) is below
	   this value */
	int max_total = 20000;
	/* pages are only scanned for links while the pending counter is below
	   this value */
	int max_requests = 500;
	/* threshold at which follow_links() stops queuing links from one page */
	int max_link_per_page = 5;
	/* when non-zero, follow_links() resolves each href against the page URL
	   with xmlBuildURI() before checking it */
	int follow_relative_links = 0;
	/* URL of the first transfer added to the multi handle */
	char *start_page = "https://www.reuters.com";

	#include <libxml/HTMLparser.h>
	#include <libxml/xpath.h>
	#include <libxml/uri.h>
	#include <curl/curl.h>
	#include <stdlib.h>
	#include <string.h>
	#include <math.h>
	#include <signal.h>

	/* Set to 1 by sighandler(); the transfer loop in main() polls it to stop.
	   It is a volatile sig_atomic_t because that is the object type the C
	   standard guarantees a signal handler may assign to. */
	volatile sig_atomic_t pending_interrupt = 0;

	/*
	 * Signal handler (registered for SIGINT in main()): records that an
	 * interrupt was requested.
	 *
	 * dummy is the signal number; it is not used.
	 */
	void sighandler(int dummy)
	{
	  (void)dummy;
	  pending_interrupt = 1;
	}

	/* resizable buffer */
	typedef struct {
	  char *buf;    /* heap block holding the bytes stored so far */
	  size_t size;  /* number of valid bytes in buf */
	} memory;

	/*
	 * Write callback (installed through CURLOPT_WRITEFUNCTION in
	 * make_handle()): appends the sz * nmemb bytes at contents to the memory
	 * buffer passed as ctx.
	 *
	 * Returns the number of bytes stored, which is sz * nmemb on success.
	 * Returns 0 without touching the buffer when there is nothing to store,
	 * when the new size would not fit in a size_t, or when realloc() fails.
	 */
	size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
	{
	  memory *mem = ctx;
	  size_t realsize;
	  char *ptr;

	  /* An empty chunk must not reach realloc(): a zero-size request may free
	     the block and return NULL, which would leave mem->buf dangling and be
	     misreported as an allocation failure. */
	  if(sz == 0 || nmemb == 0)
	    return 0;

	  /* refuse sizes that do not fit in a size_t */
	  if(sz > (size_t)-1 / nmemb)
	    return 0;
	  realsize = sz * nmemb;
	  if(realsize > (size_t)-1 - mem->size)
	    return 0;

	  ptr = realloc(mem->buf, mem->size + realsize);
	  if(!ptr) {
	    /* out of memory; mem->buf is still valid and owned by the caller */
	    printf("not enough memory (realloc returned NULL)\n");
	    return 0;
	  }
	  mem->buf = ptr;
	  memcpy(mem->buf + mem->size, contents, realsize);
	  mem->size += realsize;
	  return realsize;
	}

	/*
	 * Creates an easy handle set up to fetch url into a freshly allocated
	 * memory buffer.
	 *
	 * The buffer is attached both as CURLOPT_WRITEDATA (filled by
	 * grow_buffer()) and as CURLOPT_PRIVATE, so the code that finishes with the
	 * handle can get it back through CURLINFO_PRIVATE; that code must free
	 * buf and the struct itself.
	 *
	 * Returns the new handle, or NULL when the handle or its buffer could not
	 * be allocated; nothing is leaked in that case.
	 */
	CURL *make_handle(char *url)
	{
	  CURL *handle;
	  memory *mem = malloc(sizeof *mem);

	  if(!mem)
	    return NULL;
	  mem->size = 0;
	  mem->buf = malloc(1);
	  if(!mem->buf) {
	    free(mem);
	    return NULL;
	  }

	  handle = curl_easy_init();
	  if(!handle) {
	    free(mem->buf);
	    free(mem);
	    return NULL;
	  }

	  /* Important: use HTTP2 over HTTPS */
	  curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
	  curl_easy_setopt(handle, CURLOPT_URL, url);

	  /* buffer body */
	  curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
	  curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
	  curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);

	  /* For completeness */
	  curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, "");
	  curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
	  curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
	  curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
	  curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
	  curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
	  curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
	  curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
	  curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
	  curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
	  curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
	  curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
	  return handle;
	}

	/* HREF finder implemented in libxml2 but could be any HTML parser */
	/*
	 * Parses the HTML held in mem (the body fetched from url), samples the
	 * href attributes of its <a> elements at random and queues a new transfer
	 * on multi_handle for every sampled link that is at least 20 characters
	 * long and starts with "http://" or "https://". At most
	 * max_link_per_page transfers are queued.
	 *
	 * Returns the number of transfers actually added to multi_handle, so the
	 * caller can add it to its count of pending transfers.
	 */
	size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
	{
	  int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR |
	             HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
	  size_t limit = max_link_per_page > 0 ? (size_t)max_link_per_page : 0;
	  size_t count = 0;
	  htmlDocPtr doc;
	  xmlXPathContextPtr context;
	  xmlXPathObjectPtr result = NULL;
	  xmlNodeSetPtr nodeset;

	  /* htmlReadMemory() takes the length as an int */
	  if(mem->size > (size_t)INT_MAX)
	    return 0;
	  doc = htmlReadMemory(mem->buf, (int)mem->size, url, NULL, opts);
	  if(!doc)
	    return 0;

	  context = xmlXPathNewContext(doc);
	  if(context) {
	    result = xmlXPathEvalExpression((const xmlChar *)"//a/@href", context);
	    xmlXPathFreeContext(context);
	  }
	  nodeset = result ? result->nodesetval : NULL;

	  if(!xmlXPathNodeSetIsEmpty(nodeset)) {
	    for(int i = 0; i < nodeset->nodeNr && count < limit; i++) {
	      /* random index in [0, nodeNr): dividing by RAND_MAX + 1.0 keeps
	         rand() == RAND_MAX from yielding the out-of-range index nodeNr */
	      int x = (int)(rand() / (RAND_MAX + 1.0) * nodeset->nodeNr);
	      const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
	      xmlChar *href = xmlNodeListGetString(doc, node, 1);
	      char *link;

	      if(follow_relative_links && href) {
	        xmlChar *orig = href;
	        href = xmlBuildURI(href, (xmlChar *) url);
	        xmlFree(orig);
	      }
	      link = (char *) href;
	      if(link && strlen(link) >= 20 &&
	         (!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8))) {
	        CURL *handle = make_handle(link);
	        if(handle) {
	          if(curl_multi_add_handle(multi_handle, handle) == CURLM_OK)
	            count++;
	          else {
	            /* never queued, so no completion message will arrive for it:
	               release the handle and its private buffer here */
	            memory *unused = NULL;
	            curl_easy_getinfo(handle, CURLINFO_PRIVATE, &unused);
	            curl_easy_cleanup(handle);
	            if(unused) {
	              free(unused->buf);
	              free(unused);
	            }
	          }
	        }
	      }
	      /* the string is released on every path, used or not */
	      if(href)
	        xmlFree(href);
	    }
	  }

	  if(result)
	    xmlXPathFreeObject(result);
	  xmlFreeDoc(doc);
	  return count;
	}

	/*
	 * Tells whether a Content-Type value denotes an HTML document.
	 *
	 * ctype may be NULL. Returns 1 when ctype contains the substring
	 * "text/html", so a bare "text/html" matches as well as a value with
	 * parameters such as "text/html; charset=utf-8"; returns 0 otherwise.
	 */
	int is_html(char *ctype)
	{
	  return ctype != NULL && strstr(ctype, "text/html") != NULL;
	}

	int main(void)
	{
	signal(SIGINT, sighandler);
	LIBXML_TEST_VERSION;
	curl_global_init(CURL_GLOBAL_DEFAULT);
	CURLM *multi_handle = curl_multi_init();
	curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
	curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);

	/* enables http/2 if available */
	#ifdef CURLPIPE_MULTIPLEX
	curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
	#endif

	/* sets html start page */
	curl_multi_add_handle(multi_handle, make_handle(start_page));

	int msgs_left;
	int pending = 0;
	int complete = 0;
	int still_running = 1;
	while(still_running && !pending_interrupt) {
	int numfds;
	curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
	curl_multi_perform(multi_handle, &still_running);

	/* See how the transfers went */
	CURLMsg *m = NULL;
	while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
	if(m->msg == CURLMSG_DONE) {
	CURL *handle = m->easy_handle;
	char *url;
	memory *mem;
	curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
	curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
	if(m->data.result == CURLE_OK) {
	long res_status;
	curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
	if(res_status == 200) {
	char *ctype;
	curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
	printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
	if(is_html(ctype) && mem->size > 100) {
	if(pending < max_requests && (complete + pending) < max_total) {
	pending += follow_links(multi_handle, mem, url);
	still_running = 1;
	}
	}
	}
	else {
	printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
	}
	}
	else {
	printf("[%d] Connection failure: %s\n", complete, url);
	}
	curl_multi_remove_handle(multi_handle, handle);
	curl_easy_cleanup(handle);
	free(mem->buf);
	free(mem);
	complete++;
	pending--;
	}
	}
	}
	curl_multi_cleanup(multi_handle);
	curl_global_cleanup();
	return 0;
	}