57 lines
1.8 KiB
C
57 lines
1.8 KiB
C
#include <tidy.h>
|
|
#include <tidybuffio.h>
|
|
|
|
#include "util.h"
|
|
#include "module.h"
|
|
|
|
/* Traverse the document tree */
|
|
void dumpNode(TidyDoc doc, TidyNode tnod, int indent)
|
|
{
|
|
TidyNode child;
|
|
for(child = tidyGetChild(tnod); child; child = tidyGetNext(child) ) {
|
|
ctmbstr name = tidyNodeGetName(child);
|
|
if(name) {
|
|
/* if it has a name, then it's an HTML tag ... */
|
|
TidyAttr attr;
|
|
fprintf(stderr, "%*.*s%s ", indent, indent, "<", name);
|
|
/* walk the attribute list */
|
|
for(attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr) ) {
|
|
fprintf(stderr, "%s", tidyAttrName(attr));
|
|
tidyAttrValue(attr)?fprintf(stderr, "=\"%s\" ",
|
|
tidyAttrValue(attr)):fprintf(stderr, " ");
|
|
}
|
|
fprintf(stderr, ">\n");
|
|
}
|
|
else {
|
|
/* if it does not have a name, then it's probably text, cdata, etc... */
|
|
TidyBuffer buf;
|
|
tidyBufInit(&buf);
|
|
tidyNodeGetText(doc, child, &buf);
|
|
fprintf(stderr, "%*.*s%s\n", indent, indent, "", buf.bp?(char *)buf.bp:"");
|
|
tidyBufFree(&buf);
|
|
}
|
|
dumpNode(doc, child, indent + 4); /* recursive */
|
|
}
|
|
}
|
|
|
|
/**
 * Page-complete hook of the debug module.
 *
 * Writes the page bytes (`data->page`, `data->npage` bytes) to stderr under
 * a header naming `data->url`, then looks up the "tidyDoc" entry in the
 * page's extra data and dumps its node tree via dumpNode().
 *
 * @param userdata  not used by this hook.
 * @param data      page-complete data; url, page, npage and extradata are read.
 * @return 0 after the tree was dumped, -1 if the "tidyDoc" entry was not found.
 */
int mod_debug_onpagecomplete(void *userdata, pagecompletedata_t *data)
{
    TidyDoc doc;

    (void)userdata;

    /* first the page exactly as stored in data->page */
    fprintf(stderr, "\n-- HTML for %s\n\n", data->url);
    fwrite(data->page, 1, data->npage, stderr);

    /* then the tree held by the tidy document, if there is one */
    fprintf(stderr, "\n\n-- *CLEANED* HTML for %s\n\n", data->url);
    doc = searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len);

    if (doc) {
        dumpNode(doc, tidyGetRoot(doc), 0);
        return 0;
    }

    error("\"tidyDoc\" entry not found. either mod_tidy failed or is not loaded before");
    return -1;
}
|
|
|
|
/**
 * Populate the module descriptor for the debug module.
 *
 * The descriptor is overwritten as a whole: the `init` pointer it already
 * holds is carried over, `onpagecomplete` is set to
 * mod_debug_onpagecomplete, and all remaining members end up
 * zero-initialized.
 *
 * @param entry  descriptor to fill in; it is dereferenced unconditionally.
 * @return always 0.
 */
int mod_debug_init(crawlermodule_t *entry)
{
    /* build the new descriptor first, reading entry->init before it is replaced */
    crawlermodule_t fresh = {
        .init = entry->init,
        .onpagecomplete = mod_debug_onpagecomplete,
    };

    *entry = fresh;
    return 0;
}
|