Spider2/include/module.h

83 lines
2.1 KiB
C

// Include guard for this header. The macro name deliberately avoids a double
// underscore: identifiers containing `__` (or starting with `_` followed by an
// uppercase letter) are reserved for the C implementation.
#ifndef SPIDER2_MODULE_H
#define SPIDER2_MODULE_H
#include <curl/curl.h>
#include "util.h"
// Result type of a crawler module's `filter` callback, which is given a URL.
// Enumerator values are spelled out explicitly (0, 1, 2 in declaration order).
// NOTE(review): the per-value meanings below are inferred from the names only;
// confirm them against the code that consumes the filter result.
typedef enum {
    FILTER_PASS   = 0, // presumably: the URL is accepted
    FILTER_STALL  = 1, // presumably: the decision is postponed
    FILTER_REJECT = 2, // presumably: the URL is refused
} filterres_t;
// Kind tag stored in the `type` field of an extradata_t entry.
// Enumerator values are spelled out explicitly (0, 1, 2 in declaration order).
// NOTE(review): what each kind implies about the entry's `val` pointer is not
// visible in this header; the hints below are guesses from the names.
typedef enum {
    EXTRA_JSON  = 0, // presumably JSON-related data -- TODO confirm
    EXTRA_TIDY  = 1, // presumably data produced by the tidy module -- TODO confirm
    EXTRA_OTHER = 2, // anything else
} extradata_type_t;
// A single extra-data entry: a kind tag, a string key and an untyped value.
// pagecompletedata_t carries a dynamic array of these in its `extradata` field.
typedef struct {
    extradata_type_t type; // kind of this entry
    char *key;             // key string of this entry
    void *val;             // untyped value; presumably interpreted according to `type` -- TODO confirm
} extradata_t;

// Declares the type extradata_dynarr_t for extradata_t elements.
// dynarr_def is not defined in this header (presumably it comes from util.h).
dynarr_def(extradata_t, extradata_dynarr_t);
// Per-page record handed (by pointer) to the `onpagecomplete` and
// `onpagedestroy` callbacks of a crawler module.
// NOTE(review): ownership and lifetime of the pointed-to members are not
// visible in this header.
typedef struct {
    const char *url;              // URL of the page
    CURL *handle;                 // libcurl easy handle that accompanies the page
    char *page;                   // page buffer
    size_t npage;                 // presumably the size of `page` in bytes -- TODO confirm
    extradata_dynarr_t extradata; // extra-data entries stored with the page
    charp_dynarr_t *parsedlinks;  // presumably links extracted from the page -- TODO confirm
} pagecompletedata_t;
// Callback type accepted by makerequest() and stored in requestedreq_t.cb.
// It receives a caller-supplied `userdata` pointer, the URL, a page buffer
// together with its size, and a CURL handle; it returns nothing.
// NOTE(review): when the callback is invoked and who owns `page` cannot be
// determined from this header.
typedef void (*reqcb_t)(void *userdata,
                        const char *url,
                        char *page,
                        size_t npage,
                        CURL *handle);
// One requested fetch: the URL, the callback to call, the opaque pointer that
// goes with the callback, and a CURL handle.
typedef struct {
    const char *url; // URL of the request
    reqcb_t cb;      // callback registered for this request
    void *userdata;  // opaque pointer, presumably forwarded to `cb` -- TODO confirm
    CURL *handle;    // libcurl easy handle belonging to this request
} requestedreq_t;

// Declares the array type used by the global `requestedreqs`.
// NOTE(review): the type name is spelled "dyanrr" (not "dynarr"). The spelling
// is kept as-is because it is part of the public interface of this header.
dynarr_def(requestedreq_t, requestedreq_dyanrr_t);
// A crawler module: an opaque state pointer plus the callbacks that make up
// the module. The meaning of the `int` return values is not defined here.
typedef struct crawlermodule {
    // Module-private state. Every callback except `init` takes a
    // `void *userdata` first argument (presumably this pointer -- TODO confirm).
    void *userdata;

    // `init` both initializes the module and, if necessary, fills in all the
    // other function pointers of the entry it is given.
    int (*init)(struct crawlermodule *entry);

    // Teardown counterpart of `init`.
    int (*destroy)(void *userdata);

    // Receives a chunk of `ndata` bytes (`data`) belonging to the page at `url`.
    int (*onpagewrite)(void *userdata, const char *url, const byte *data, size_t ndata);

    // Page-level callbacks; both are given the page's pagecompletedata_t.
    int (*onpagecomplete)(void *userdata, pagecompletedata_t *data);
    int (*onpagedestroy)(void *userdata, pagecompletedata_t *data);

    // Returns a filterres_t verdict for `url`.
    filterres_t (*filter)(void *userdata, const char *url);
} crawlermodule_t;

// Array types holding modules by value and by pointer, respectively.
dynarr_def(crawlermodule_t, crawlermodule_dynarr_t);
dynarr_def(crawlermodule_t *, crawlermodulep_dynarr_t);
// Pairs a module with a name. The global `availmodules` is an array of these.
typedef struct {
    const char *name;       // name of the module
    crawlermodule_t module; // the module itself, stored by value
} moduleentry_t;

// Array types holding module entries by value and by pointer, respectively.
dynarr_def(moduleentry_t, moduleentry_dynarr_t);
dynarr_def(moduleentry_t *, moduleentryp_dynarr_t);
// Takes a kind tag, a key, and an array `data` of `ndata` extradata_t entries;
// returns an untyped pointer.
// Presumably returns the `val` of the entry matching `type` and `key`, and a
// null pointer when there is none -- TODO confirm against the definition.
// NOTE(review): `key` is not const-qualified; if the definition never writes
// through it, `const char *` would be the better parameter type.
void *searchextradata(extradata_type_t type,
                      char *key,
                      extradata_t *data,
                      size_t ndata);
// Requests `url`, registering `cb` as the callback and `cbdata` as its opaque
// pointer.
// Presumably the request is recorded in `requestedreqs` and `cbdata` is later
// passed to `cb` as `userdata` -- TODO confirm against the definition.
void makerequest(const char *url,
                 reqcb_t cb,
                 void *cbdata);
// Initialization functions of the individual modules (pagedata, tidy, debug,
// parse, robots). Each one has the signature of crawlermodule_t's `init`
// member: it takes the module entry and returns an int.
// NOTE(review): the return-value convention is not stated in this header.
int mod_pagedata_init(crawlermodule_t *entry);
int mod_tidy_init(crawlermodule_t *entry);
int mod_debug_init(crawlermodule_t *entry);
int mod_parse_init(crawlermodule_t *entry);
int mod_robots_init(crawlermodule_t *entry);
// Globals declared here and defined in some other translation unit.

// Dynamic array of requestedreq_t records.
extern requestedreq_dyanrr_t requestedreqs;

// Array of module entries. Its length is not part of the declaration; how the
// end of the array is marked is not visible in this header.
extern moduleentry_t availmodules[];
#endif