#include #include #include "util.h" #include "module.h" /* Traverse the document tree */ void dumpNode(TidyDoc doc, TidyNode tnod, int indent) { TidyNode child; for(child = tidyGetChild(tnod); child; child = tidyGetNext(child) ) { ctmbstr name = tidyNodeGetName(child); if(name) { /* if it has a name, then it's an HTML tag ... */ TidyAttr attr; fprintf(stderr, "%*.*s%s ", indent, indent, "<", name); /* walk the attribute list */ for(attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr) ) { fprintf(stderr, "%s", tidyAttrName(attr)); tidyAttrValue(attr)?fprintf(stderr, "=\"%s\" ", tidyAttrValue(attr)):fprintf(stderr, " "); } fprintf(stderr, ">\n"); } else { /* if it does not have a name, then it's probably text, cdata, etc... */ TidyBuffer buf; tidyBufInit(&buf); tidyNodeGetText(doc, child, &buf); fprintf(stderr, "%*.*s%s\n", indent, indent, "", buf.bp?(char *)buf.bp:""); tidyBufFree(&buf); } dumpNode(doc, child, indent + 4); /* recursive */ } } int mod_debug_onpagecomplete(void *userdata, pagecompletedata_t *data) { fprintf(stderr, "\n-- HTML for %s\n\n", data->url); fwrite(data->page, 1, data->npage, stderr); fprintf(stderr, "\n\n-- *CLEANED* HTML for %s\n\n", data->url); TidyDoc doc = searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len); if (doc == NULL) { error("\"tidyDoc\" entry not found. either mod_tidy failed or is not loaded before"); return -1; } dumpNode(doc, tidyGetRoot(doc), 0); return 0; } int mod_debug_init(crawlermodule_t *entry) { *entry = (crawlermodule_t) { .init = entry->init, .onpagecomplete = mod_debug_onpagecomplete, }; return 0; }