Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/curl/docs/examples/htmltidy.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/curl/docs/examples/htmltidy.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,128 @@ +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ \| | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * \___|\___/|_| \_\_____| + * + * Copyright (C) 1998 - 2017, Daniel Stenberg, <daniel@haxx.se>, et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at https://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ +/* <DESC> + * Download a document and use libtidy to parse the HTML. + * </DESC> + */ +/* + * LibTidy => https://www.html-tidy.org/ + */ + +#include <stdio.h> +#include <tidy.h> +#include <tidybuffio.h> +#include <curl/curl.h> + +/* curl write callback, to fill tidy's input buffer... */ +uint write_cb(char *in, uint size, uint nmemb, TidyBuffer *out) +{ + uint r; + r = size * nmemb; + tidyBufAppend(out, in, r); + return r; +} + +/* Traverse the document tree */ +void dumpNode(TidyDoc doc, TidyNode tnod, int indent) +{ + TidyNode child; + for(child = tidyGetChild(tnod); child; child = tidyGetNext(child) ) { + ctmbstr name = tidyNodeGetName(child); + if(name) { + /* if it has a name, then it's an HTML tag ... */ + TidyAttr attr; + printf("%*.*s%s ", indent, indent, "<", name); + /* walk the attribute list */ + for(attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr) ) { + printf(tidyAttrName(attr)); + tidyAttrValue(attr)?printf("=\"%s\" ", + tidyAttrValue(attr)):printf(" "); + } + printf(">\n"); + } + else { + /* if it doesn't have a name, then it's probably text, cdata, etc... */ + TidyBuffer buf; + tidyBufInit(&buf); + tidyNodeGetText(doc, child, &buf); + printf("%*.*s\n", indent, indent, buf.bp?(char *)buf.bp:""); + tidyBufFree(&buf); + } + dumpNode(doc, child, indent + 4); /* recursive */ + } +} + + +int main(int argc, char **argv) +{ + if(argc == 2) { + CURL *curl; + char curl_errbuf[CURL_ERROR_SIZE]; + TidyDoc tdoc; + TidyBuffer docbuf = {0}; + TidyBuffer tidy_errbuf = {0}; + int err; + + curl = curl_easy_init(); + curl_easy_setopt(curl, CURLOPT_URL, argv[1]); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf); + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb); + + tdoc = tidyCreate(); + tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */ + tidyOptSetInt(tdoc, TidyWrapLen, 4096); + tidySetErrorBuffer(tdoc, &tidy_errbuf); + tidyBufInit(&docbuf); + + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf); + err = curl_easy_perform(curl); + if(!err) { + err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */ + if(err >= 0) { + err = tidyCleanAndRepair(tdoc); /* fix any problems */ + if(err >= 0) { + err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */ + if(err >= 0) { + dumpNode(tdoc, tidyGetRoot(tdoc), 0); /* walk the tree */ + fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */ + } + } + } + } + else + fprintf(stderr, "%s\n", curl_errbuf); + + /* clean-up */ + curl_easy_cleanup(curl); + tidyBufFree(&docbuf); + tidyBufFree(&tidy_errbuf); + tidyRelease(tdoc); + return err; + + } + else + printf("usage: %s <url>\n", argv[0]); + + return 0; +}
