Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/curl/docs/examples/htmltitle.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/curl/docs/examples/htmltitle.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,294 @@ +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 1998 - 2017, Daniel Stenberg, <daniel@haxx.se>, et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at https://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ +/* <DESC> + * Get a web page, extract the title with libxml. + * </DESC> + + Written by Lars Nilsson + + GNU C++ compile command line suggestion (edit paths accordingly): + + g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cpp \ + -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2 +*/ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <string> +#include <curl/curl.h> +#include <libxml/HTMLparser.h> + +// +// Case-insensitive string comparison +// + +#ifdef _MSC_VER +#define COMPARE(a, b) (!_stricmp((a), (b))) +#else +#define COMPARE(a, b) (!strcasecmp((a), (b))) +#endif + +// +// libxml callback context structure +// + +struct Context +{ + Context(): addTitle(false) { } + + bool addTitle; + std::string title; +}; + +// +// libcurl variables for error strings and returned data + +static char errorBuffer[CURL_ERROR_SIZE]; +static std::string buffer; + +// +// libcurl write callback function +// + +static int writer(char *data, size_t size, size_t nmemb, + std::string *writerData) +{ + if(writerData == NULL) + return 0; + + writerData->append(data, size*nmemb); + + return size * nmemb; +} + +// +// libcurl connection initialization +// + +static bool init(CURL *&conn, char *url) +{ + CURLcode code; + + conn = curl_easy_init(); + + if(conn == NULL) { + fprintf(stderr, "Failed to create CURL connection\n"); + exit(EXIT_FAILURE); + } + + code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer); + if(code != CURLE_OK) { + fprintf(stderr, "Failed to set error buffer [%d]\n", code); + return false; + } + + code = curl_easy_setopt(conn, CURLOPT_URL, url); + if(code != CURLE_OK) { + fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer); + return false; + } + + code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1L); + if(code != CURLE_OK) { + fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer); + return false; + } + + code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer); + if(code != CURLE_OK) { + fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer); + return false; + } + + code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer); + if(code != CURLE_OK) { + fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer); + return false; + } + + return true; +} + +// +// libxml start element callback function +// + +static void StartElement(void *voidContext, + const xmlChar *name, + const xmlChar **attributes) +{ + Context *context = static_cast<Context *>(voidContext); + + if(COMPARE(reinterpret_cast<char *>(name), "TITLE")) { + context->title = ""; + context->addTitle = true; + } + (void) attributes; +} + +// +// libxml end element callback function +// + +static void EndElement(void *voidContext, + const xmlChar *name) +{ + Context *context = static_cast<Context *>(voidContext); + + if(COMPARE(reinterpret_cast<char *>(name), "TITLE")) + context->addTitle = false; +} + +// +// Text handling helper function +// + +static void handleCharacters(Context *context, + const xmlChar *chars, + int length) +{ + if(context->addTitle) + context->title.append(reinterpret_cast<char *>(chars), length); +} + +// +// libxml PCDATA callback function +// + +static void Characters(void *voidContext, + const xmlChar *chars, + int length) +{ + Context *context = static_cast<Context *>(voidContext); + + handleCharacters(context, chars, length); +} + +// +// libxml CDATA callback function +// + +static void cdata(void *voidContext, + const xmlChar *chars, + int length) +{ + Context *context = static_cast<Context *>(voidContext); + + handleCharacters(context, chars, length); +} + +// +// libxml SAX callback structure +// + +static htmlSAXHandler saxHandler = +{ + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + StartElement, + EndElement, + NULL, + Characters, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + cdata, + NULL +}; + +// +// Parse given (assumed to be) HTML text and return the title +// + +static void parseHtml(const std::string &html, + std::string &title) +{ + htmlParserCtxtPtr ctxt; + Context context; + + ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "", + XML_CHAR_ENCODING_NONE); + + htmlParseChunk(ctxt, html.c_str(), html.size(), 0); + htmlParseChunk(ctxt, "", 0, 1); + + htmlFreeParserCtxt(ctxt); + + title = context.title; +} + +int main(int argc, char *argv[]) +{ + CURL *conn = NULL; + CURLcode code; + std::string title; + + // Ensure one argument is given + + if(argc != 2) { + fprintf(stderr, "Usage: %s <url>\n", argv[0]); + exit(EXIT_FAILURE); + } + + curl_global_init(CURL_GLOBAL_DEFAULT); + + // Initialize CURL connection + + if(!init(conn, argv[1])) { + fprintf(stderr, "Connection initializion failed\n"); + exit(EXIT_FAILURE); + } + + // Retrieve content for the URL + + code = curl_easy_perform(conn); + curl_easy_cleanup(conn); + + if(code != CURLE_OK) { + fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer); + exit(EXIT_FAILURE); + } + + // Parse the (assumed) HTML code + parseHtml(buffer, title); + + // Display the extracted title + printf("Title: %s\n", title.c_str()); + + return EXIT_SUCCESS; +}
