Spider2/src/mod_debug.c

57 lines
1.8 KiB
C

#include <tidy.h>
#include <tidybuffio.h>
#include "util.h"
#include "module.h"
/*
 * Recursively dump the children of a tidy node to stderr.
 *
 * A child that has a name is printed as a tag with its attributes
 * (`<name attr="value" ... >`); a child without a name (probably text,
 * cdata, etc.) is printed as the text tidy renders for it. Children are
 * visited in sibling order and each nesting level adds 4 to the indent.
 *
 * doc    - tidy document the nodes belong to
 * tnod   - node whose children are dumped (tnod itself is not printed)
 * indent - column at which tag names and text of the children start
 */
void dumpNode(TidyDoc doc, TidyNode tnod, int indent)
{
    /*
     * The '<' hangs one column to the left of the tag name so that names
     * line up with text printed at the same depth. Printing "<" through
     * "%*.*s" with indent as the precision dropped the '<' entirely at
     * indent 0, so pad explicitly and always emit the bracket.
     */
    const int pad = indent > 1 ? indent - 1 : 0;
    TidyNode child;

    for (child = tidyGetChild(tnod); child; child = tidyGetNext(child)) {
        ctmbstr name = tidyNodeGetName(child);

        if (name) {
            /* if it has a name, then it's an HTML tag ... */
            TidyAttr attr;

            fprintf(stderr, "%*s<%s ", pad, "", name);

            /* walk the attribute list */
            for (attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr)) {
                ctmbstr attr_name = tidyAttrName(attr);
                ctmbstr attr_value = tidyAttrValue(attr);

                /*
                 * Passing NULL to "%s" is undefined behavior.
                 * NOTE(review): confirm whether tidyAttrName can return
                 * NULL in the tidy version in use; the guard is harmless
                 * either way.
                 */
                fprintf(stderr, "%s", attr_name ? attr_name : "");

                if (attr_value) {
                    fprintf(stderr, "=\"%s\" ", attr_value);
                } else {
                    /* value-less attribute: just separate it from the next */
                    fprintf(stderr, " ");
                }
            }
            fprintf(stderr, ">\n");
        } else {
            /* if it does not have a name, then it's probably text, cdata, etc... */
            TidyBuffer buf;

            tidyBufInit(&buf);
            tidyNodeGetText(doc, child, &buf);
            /* buf.bp stays NULL when nothing was written into the buffer */
            fprintf(stderr, "%*s%s\n", indent, "", buf.bp ? (char *)buf.bp : "");
            tidyBufFree(&buf);
        }

        dumpNode(doc, child, indent + 4); /* recursive */
    }
}
/*
 * Page-complete callback of the debug module.
 *
 * Writes the page exactly as stored in data->page (data->npage bytes) to
 * stderr, then looks up the "tidyDoc" entry in the page's extra data and
 * dumps that document's node tree via dumpNode().
 *
 * userdata - not used by this callback
 * data     - page information: url, page/npage and the extradata list
 *
 * Returns 0 after the tree was dumped, -1 when no "tidyDoc" entry is found.
 */
int mod_debug_onpagecomplete(void *userdata, pagecompletedata_t *data) {
    TidyDoc tdoc;

    /* the page as stored in data->page */
    fprintf(stderr, "\n-- HTML for %s\n\n", data->url);
    fwrite(data->page, 1, data->npage, stderr);

    /* the header goes out before the lookup, so it appears even on failure */
    fprintf(stderr, "\n\n-- *CLEANED* HTML for %s\n\n", data->url);

    tdoc = searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len);
    if (tdoc != NULL) {
        dumpNode(tdoc, tidyGetRoot(tdoc), 0);
        return 0;
    }

    error("\"tidyDoc\" entry not found. either mod_tidy failed or is not loaded before");
    return -1;
}
/*
 * Fill in the module descriptor for the debug module.
 *
 * The descriptor is rebuilt from scratch: the init pointer already stored
 * in *entry is carried over, onpagecomplete is pointed at
 * mod_debug_onpagecomplete, and every other member ends up zero-initialized.
 *
 * entry - module descriptor to overwrite
 *
 * Always returns 0.
 */
int mod_debug_init(crawlermodule_t *entry) {
    /* build the new descriptor first so entry->init is read before *entry is overwritten */
    crawlermodule_t fresh = {
        .init = entry->init,
        .onpagecomplete = mod_debug_onpagecomplete,
    };

    *entry = fresh;
    return 0;
}