Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/curl/docs/examples/crawler.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /*************************************************************************** | |
| 2 * _ _ ____ _ | |
| 3 * Project ___| | | | _ \| | | |
| 4 * / __| | | | |_) | | | |
| 5 * | (__| |_| | _ <| |___ | |
| 6 * \___|\___/|_| \_\_____| | |
| 7 * | |
| 8 * Web crawler based on curl and libxml2. | |
| 9 * Copyright (C) 2018 Jeroen Ooms <jeroenooms@gmail.com> | |
| 10 * License: MIT | |
| 11 * | |
| 12 * To compile: | |
| 13 * gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl) | |
| 14 * | |
| 15 */ | |
| 16 /* <DESC> | |
| 17 * Web crawler based on curl and libxml2 to stress-test curl with | |
| 18 * hundreds of concurrent connections to various servers. | |
| 19 * </DESC> | |
| 20 */ | |
| 21 | |
/* Tunable crawl parameters */

/* Upper bound on simultaneous connections; main() passes this to
   CURLMOPT_MAX_TOTAL_CONNECTIONS. */
int max_con = 200;

/* main() stops following links once complete + pending reaches this. */
int max_total = 20000;

/* main() stops following links while this many transfers are pending. */
int max_requests = 500;

/* Per-page limit on queued links, checked by follow_links(). */
int max_link_per_page = 5;

/* When non-zero, follow_links() resolves each href against the page URL
   before filtering it. */
int follow_relative_links = 0;

/* First URL added to the multi handle in main(). */
char *start_page = "https://www.reuters.com";
| 29 | |
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include <curl/curl.h>
#include <math.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
| 38 | |
/* Set to 1 by sighandler(); polled by the transfer loop in main() so that
   Ctrl-C ends the crawl. Declared volatile sig_atomic_t because that is the
   object type the C standard guarantees a signal handler may store to. */
volatile sig_atomic_t pending_interrupt = 0;

/* Signal handler installed for SIGINT: only records that an interrupt was
   requested. The signal number is not used. */
void sighandler(int dummy)
{
  (void)dummy;
  pending_interrupt = 1;
}
| 44 | |
/* resizable buffer: buf holds size bytes of received data (no terminator) */
typedef struct {
  char *buf;
  size_t size;
} memory;

/*
 * Write callback: appends sz * nmemb bytes from contents to the memory
 * buffer passed as ctx.
 *
 * Returns the number of bytes stored. Returns 0 if the buffer could not be
 * grown; the existing buffer is left untouched in that case.
 */
size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
{
  size_t realsize = sz * nmemb;
  memory *mem = (memory*) ctx;

  /* The callback can be invoked with zero bytes (empty body). With an empty
     buffer that would become realloc(buf, 0), which may free buf and return
     NULL, leaving mem->buf dangling. Nothing to append, so report success. */
  if(realsize == 0)
    return 0;

  char *ptr = realloc(mem->buf, mem->size + realsize);
  if(!ptr) {
    /* out of memory */
    printf("not enough memory (realloc returned NULL)\n");
    return 0;
  }
  mem->buf = ptr;
  memcpy(&(mem->buf[mem->size]), contents, realsize);
  mem->size += realsize;
  return realsize;
}
| 66 | |
| 67 CURL *make_handle(char *url) | |
| 68 { | |
| 69 CURL *handle = curl_easy_init(); | |
| 70 | |
| 71 /* Important: use HTTP2 over HTTPS */ | |
| 72 curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS); | |
| 73 curl_easy_setopt(handle, CURLOPT_URL, url); | |
| 74 | |
| 75 /* buffer body */ | |
| 76 memory *mem = malloc(sizeof(memory)); | |
| 77 mem->size = 0; | |
| 78 mem->buf = malloc(1); | |
| 79 curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer); | |
| 80 curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem); | |
| 81 curl_easy_setopt(handle, CURLOPT_PRIVATE, mem); | |
| 82 | |
| 83 /* For completeness */ | |
| 84 curl_easy_setopt(handle, CURLOPT_ACCEPT_ENCODING, ""); | |
| 85 curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L); | |
| 86 curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L); | |
| 87 curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L); | |
| 88 curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L); | |
| 89 curl_easy_setopt(handle, CURLOPT_COOKIEFILE, ""); | |
| 90 curl_easy_setopt(handle, CURLOPT_FILETIME, 1L); | |
| 91 curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler"); | |
| 92 curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY); | |
| 93 curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L); | |
| 94 curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY); | |
| 95 curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L); | |
| 96 return handle; | |
| 97 } | |
| 98 | |
| 99 /* HREF finder implemented in libxml2 but could be any HTML parser */ | |
| 100 size_t follow_links(CURLM *multi_handle, memory *mem, char *url) | |
| 101 { | |
| 102 int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \ | |
| 103 HTML_PARSE_NOWARNING | HTML_PARSE_NONET; | |
| 104 htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts); | |
| 105 if(!doc) | |
| 106 return 0; | |
| 107 xmlChar *xpath = (xmlChar*) "//a/@href"; | |
| 108 xmlXPathContextPtr context = xmlXPathNewContext(doc); | |
| 109 xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context); | |
| 110 xmlXPathFreeContext(context); | |
| 111 if(!result) | |
| 112 return 0; | |
| 113 xmlNodeSetPtr nodeset = result->nodesetval; | |
| 114 if(xmlXPathNodeSetIsEmpty(nodeset)) { | |
| 115 xmlXPathFreeObject(result); | |
| 116 return 0; | |
| 117 } | |
| 118 size_t count = 0; | |
| 119 for(int i = 0; i < nodeset->nodeNr; i++) { | |
| 120 double r = rand(); | |
| 121 int x = r * nodeset->nodeNr / RAND_MAX; | |
| 122 const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode; | |
| 123 xmlChar *href = xmlNodeListGetString(doc, node, 1); | |
| 124 if(follow_relative_links) { | |
| 125 xmlChar *orig = href; | |
| 126 href = xmlBuildURI(href, (xmlChar *) url); | |
| 127 xmlFree(orig); | |
| 128 } | |
| 129 char *link = (char *) href; | |
| 130 if(!link || strlen(link) < 20) | |
| 131 continue; | |
| 132 if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) { | |
| 133 curl_multi_add_handle(multi_handle, make_handle(link)); | |
| 134 if(count++ == max_link_per_page) | |
| 135 break; | |
| 136 } | |
| 137 xmlFree(link); | |
| 138 } | |
| 139 xmlXPathFreeObject(result); | |
| 140 return count; | |
| 141 } | |
| 142 | |
/*
 * Returns 1 when the Content-Type string ctype mentions "text/html",
 * 0 otherwise (including when ctype is NULL).
 *
 * No minimum length is required, so a bare "text/html" without a charset
 * parameter is accepted as HTML too.
 */
int is_html(char *ctype)
{
  return ctype != NULL && strstr(ctype, "text/html") != NULL;
}
| 147 | |
| 148 int main(void) | |
| 149 { | |
| 150 signal(SIGINT, sighandler); | |
| 151 LIBXML_TEST_VERSION; | |
| 152 curl_global_init(CURL_GLOBAL_DEFAULT); | |
| 153 CURLM *multi_handle = curl_multi_init(); | |
| 154 curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con); | |
| 155 curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L); | |
| 156 | |
| 157 /* enables http/2 if available */ | |
| 158 #ifdef CURLPIPE_MULTIPLEX | |
| 159 curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX); | |
| 160 #endif | |
| 161 | |
| 162 /* sets html start page */ | |
| 163 curl_multi_add_handle(multi_handle, make_handle(start_page)); | |
| 164 | |
| 165 int msgs_left; | |
| 166 int pending = 0; | |
| 167 int complete = 0; | |
| 168 int still_running = 1; | |
| 169 while(still_running && !pending_interrupt) { | |
| 170 int numfds; | |
| 171 curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds); | |
| 172 curl_multi_perform(multi_handle, &still_running); | |
| 173 | |
| 174 /* See how the transfers went */ | |
| 175 CURLMsg *m = NULL; | |
| 176 while((m = curl_multi_info_read(multi_handle, &msgs_left))) { | |
| 177 if(m->msg == CURLMSG_DONE) { | |
| 178 CURL *handle = m->easy_handle; | |
| 179 char *url; | |
| 180 memory *mem; | |
| 181 curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem); | |
| 182 curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url); | |
| 183 if(m->data.result == CURLE_OK) { | |
| 184 long res_status; | |
| 185 curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status); | |
| 186 if(res_status == 200) { | |
| 187 char *ctype; | |
| 188 curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype); | |
| 189 printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url); | |
| 190 if(is_html(ctype) && mem->size > 100) { | |
| 191 if(pending < max_requests && (complete + pending) < max_total) { | |
| 192 pending += follow_links(multi_handle, mem, url); | |
| 193 still_running = 1; | |
| 194 } | |
| 195 } | |
| 196 } | |
| 197 else { | |
| 198 printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url); | |
| 199 } | |
| 200 } | |
| 201 else { | |
| 202 printf("[%d] Connection failure: %s\n", complete, url); | |
| 203 } | |
| 204 curl_multi_remove_handle(multi_handle, handle); | |
| 205 curl_easy_cleanup(handle); | |
| 206 free(mem->buf); | |
| 207 free(mem); | |
| 208 complete++; | |
| 209 pending--; | |
| 210 } | |
| 211 } | |
| 212 } | |
| 213 curl_multi_cleanup(multi_handle); | |
| 214 curl_global_cleanup(); | |
| 215 return 0; | |
| 216 } |
