commit 20c3cd55106adcc3d1404b9be22c96f76e5b6b83
Author: Ava Pagefault
Date:   Sun Sep 1 10:12:11 2024 -0400

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c3c62dc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+build/
+.cache/
+result
+outputs/
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..a097b53
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,70 @@
+# Build script for the Spider2 crawler and its CTest unit tests.
+# 3.14 is the true floor: target_link_options() needs 3.13 and install(TARGETS)
+# without an explicit destination needs 3.14.
+cmake_minimum_required(VERSION 3.14)
+
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_C_STANDARD_REQUIRED ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON CACHE INTERNAL "")
+
+# The *_INIT variables only seed the flag cache entries, so set them before project().
+set(CMAKE_C_FLAGS_RELEASE_INIT "-Wall -Wextra -Wpedantic -Wno-language-extension-token -Wno-gnu-statement-expression-from-macro-expansion")
+set(CMAKE_C_FLAGS_DEBUG_INIT "${CMAKE_C_FLAGS_RELEASE_INIT} -gdwarf-4")
+
+project(Spider2 VERSION 1.0 LANGUAGES C)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+
+find_package(CURL REQUIRED)
+
+# Main executable
+
+# CONFIGURE_DEPENDS re-checks the glob at build time so new sources are picked up.
+file(GLOB_RECURSE srcFiles CONFIGURE_DEPENDS src/*.c)
+add_executable(${PROJECT_NAME} ${srcFiles})
+# tidy is linked as a library rather than through a raw -ltidy link option so that
+# it is placed after the object files on the link line.
+target_link_libraries(${PROJECT_NAME} PRIVATE CURL::libcurl tidy)
+target_include_directories(${PROJECT_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include")
+
+# Tests
+
+include(CTest)
+
+if(BUILD_TESTING)
+  # Test sources are listed explicitly; each becomes one entry of the CommonTests driver.
+  set(testsToRun
+    tests/dynarr_extensions.c
+    tests/dynarr_get.c
+    tests/dynarr_get1_death.c
+    tests/dynarr_get2_death.c
+    tests/dynarr_get3_death.c
+    tests/dynarr_insert.c
+    tests/deque_push.c
+    tests/deque_pop.c
+    tests/json_write.c
+    tests/hset_iter.c
+    tests/hset_add.c
+  )
+
+  create_test_sourcelist(tests CommonTests.c ${testsToRun})
+  add_executable(CommonTests ${tests} src/util.c src/json.c)
+  target_link_libraries(CommonTests PRIVATE CURL::libcurl)
+  target_include_directories(CommonTests PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include")
+  # Swap -gdwarf-4 for "-gdwarf-4 -pg" in both lists below to profile with gprof.
+  target_compile_options(CommonTests PRIVATE -gdwarf-4)
+  target_link_options(CommonTests PRIVATE -gdwarf-4)
+
+  # Register one CTest case per source; *_death tests are expected to fail.
+  foreach(testFile IN LISTS testsToRun)
+    get_filename_component(testName "${testFile}" NAME_WE)
+    add_test(NAME ${testName} COMMAND CommonTests tests/${testName})
+    if(testName MATCHES "_death$")
+      set_property(TEST ${testName} PROPERTY WILL_FAIL ON)
+    endif()
+  endforeach()
+endif()
+
+# Install rules
+include(GNUInstallDirs)
+install(TARGETS Spider2 RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4ba37fd
--- /dev/null
+++ b/README.md
@@ -0,0 +1,12 @@
+# Shitty http spider in C
+lol
+## Build Instructions
+In order to build this, you need: cmake, curl, tidy, a c compiler, as well as all of the appropriate development header packages.
+```
+mkdir build
+cd build
+cmake -S .. -B . -DCMAKE_BUILD_TYPE=Release
+cmake --build .
+```
+After you built it, the binary should be in `./build/Spider2` relative to the repo root.
+To run the tests, run `ctest` in `./build` relative to the repo root.
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 0000000..852582c
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,27 @@
+{
+  "nodes": {
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1718350267,
+        "narHash": "sha256-hrf/m9msEun15Vbs8+IOijFe4Sb58KxG/BnDSL9xgZQ=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "ecbc30d5ed9f75449233b17d4a4cdeab53af793f",
+        "type": "github"
+      },
+      "original": {
+        "owner": "NixOS",
+        "ref": "release-24.05",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "nixpkgs": "nixpkgs"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 0000000..c6c0777
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,56 @@
+{
+  inputs.nixpkgs.url = "github:NixOS/nixpkgs/release-24.05";
+  outputs = { self, nixpkgs }:
+    let
+      defaultArchs = [ "x86_64-linux" "x86_64-darwin" "aarch64-linux" "aarch64-darwin" ];
+      forAllSystems = nixpkgs.lib.attrsets.genAttrs defaultArchs;
+      nixpkgsFor = forAllSystems (system: import nixpkgs { inherit system; overlays = [ self.overlay ]; });
+    in
+    {
+      overlay = final: prev: {
+        spider2 = final.stdenv.mkDerivation {
+          name = "Spider2";
+          src = ./.;
+          nativeBuildInputs = with final; [
+            cmake
+            pkg-config
+          ];
+          buildInputs = with final; [ curl html-tidy ];
+          dontUnpack = true;
+          configurePhase = ''
+            runHook preConfigure
+            cmake -S $src -B . -DCMAKE_BUILD_TYPE=DEBUG
+            runHook postConfigure
+          '';
+          buildPhase = ''
+            runHook preBuild
+            cmake --build .
+            runHook postBuild
+          '';
+          installPhase = ''
+            runHook preInstall
+            cmake --install . --prefix $out
+            runHook postInstall
+          '';
+        };
+        spider2WithDebug = final.spider2.overrideAttrs (_: _: {
+          hardeningDisable = [ "all" ];
+          dontStrip = true;
+        });
+      };
+      packages = forAllSystems (system: {
+        default = nixpkgsFor."${system}".spider2;
+        inherit (nixpkgsFor."${system}") spider2WithDebug;
+      });
+      devShells = forAllSystems (system:
+        let
+          pkgs = nixpkgsFor."${system}";
+          debugPkgs = with pkgs; [ clang-tools valgrind gdb ];
+        in
+        {
+          default = pkgs.spider2WithDebug.overrideAttrs (finalAttrs: previousAttrs: {
+            nativeBuildInputs = previousAttrs.nativeBuildInputs ++ debugPkgs;
+          });
+        });
+    };
+}
diff --git a/include/crawler.h b/include/crawler.h
new file mode 100644
index 0000000..8b444a5
--- /dev/null
+++ b/include/crawler.h
@@ -0,0 +1,14 @@
+#ifndef __CRAWLER_H_
+#define __CRAWLER_H_
+
+#include "module.h"
+
+typedef struct {
+    const char **allowedhosts;
+    double req_interval_s;
+    moduleentryp_dynarr_t enabledmodules;
+} crawlerconfig_t;
+
+void crawler(const char *seed, const crawlerconfig_t *config);
+
+#endif
diff --git a/include/http.h b/include/http.h
new file mode 100644
index 0000000..c5b1de7
--- /dev/null
+++ b/include/http.h
@@ -0,0 +1,47 @@
+#ifndef __HTTP_H_
+#define __HTTP_H_
+
+/* NOTE(review): the <...> header names were lost in extraction; reconstructed from usage -- confirm. */
+#include <stdbool.h>
+#include <curl/curl.h>
+
+#include "util.h"
+#include "module.h"
+
+typedef struct {
+    int status, flags, num_requests;
+} headercb_data_t;
+
+typedef struct {
+    byte *base, *begin, *end;
+    moduleentryp_dynarr_t *modules;
+    const char *url;
+    bool wasrequested;
+} writecb_data_t;
+
+typedef struct {
+    writecb_data_t writecb_data;
+    headercb_data_t headercb_data;
+} cbdata_t;
+
+#define HEADERCB_VALID_MIME (1 << 0)
+#define HEADERCB_CONTENT_TYPE_ENCOUNTERED (1 << 1)
+
+headercb_data_t http_get_to_buf(CURLcode *res, CURL *curl, byte **cnt, size_t *cntlen);
+char *relative2absolute(CURLU *curl_url_h, const char *parent, const char *relative);
+charp_dynarr_t parsehrefs(CURLU *curl_url_h, const char *url, const char *page, size_t pagelen);
+
+extern const char *useragents[];
+
+typedef struct {
+    const char *host;
+    struct curl_slist *headers;
+    int totalfailurecnt, failurecnt, visitcnt;
+} hostentry_t;
+
+dynarr_def(hostentry_t, host_dynarr_t);
+
+CURL *makehandle(const char *url, hostentry_t *host_entry, cbdata_t *cbdata, bool wasrequested);
+void initcbdata(const char *url, moduleentryp_dynarr_t *modules, cbdata_t *data);
+
+#endif
diff --git a/include/json.h b/include/json.h
new file mode 100644
index 0000000..0cce411
--- /dev/null
+++ b/include/json.h
@@ -0,0 +1,41 @@
+#ifndef __JSON_H_
+#define __JSON_H_
+
+/* NOTE(review): the <...> header names were lost in extraction; reconstructed from usage -- confirm. */
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "util.h"
+
+typedef enum {
+    JSON_OBJECT,
+    JSON_ARRAY,
+    JSON_STRING,
+    JSON_INT,
+    JSON_BOOL,
+    JSON_NULL,
+} jsontype_t;
+
+typedef struct jsonval {
+    jsontype_t type;
+    void *data;
+} jsonval_t;
+
+typedef struct {
+    const char *key;
+    jsonval_t val;
+} jsonkv_t;
+
+dynarr_def(jsonval_t, jsonval_dynarr_t);
+dynarr_def(jsonkv_t, jsonkv_dynarr_t);
+
+jsonval_t json_createobj(jsonkv_dynarr_t pairs);
+jsonval_t json_createarr(jsonval_dynarr_t elems);
+jsonval_t json_createstr(const char *str);
+jsonval_t json_createint(long num);
+jsonval_t json_createbool(bool val);
+jsonval_t json_createnull(void);
+void json_destroy(jsonval_t *val);
+void json_write(FILE *out, jsonval_t *val);
+
+#endif
diff --git a/include/module.h b/include/module.h
new file mode 100644
index 0000000..9d8094b
--- /dev/null
+++ b/include/module.h
@@ -0,0 +1,83 @@
+#ifndef __MODULE_H_
+#define __MODULE_H_
+
+/* NOTE(review): the <...> header name was lost in extraction; reconstructed from usage -- confirm. */
+#include <curl/curl.h>
+
+#include "util.h"
+
+typedef enum {
+    FILTER_PASS,
+    FILTER_STALL,
+    FILTER_REJECT,
+} filterres_t;
+
+typedef enum {
+    EXTRA_JSON,
+    EXTRA_TIDY,
+    EXTRA_OTHER,
+} extradata_type_t;
+
+typedef struct {
+    extradata_type_t type;
+    char *key;
+    void *val;
+} extradata_t;
+
+dynarr_def(extradata_t, extradata_dynarr_t);
+
+typedef struct {
+    const char *url;
+    CURL *handle;
+    char *page;
+    size_t npage;
+    extradata_dynarr_t extradata;
+    charp_dynarr_t *parsedlinks;
+} pagecompletedata_t;
+
+typedef void (*reqcb_t)(void *userdata, const char *url, char *page, size_t npage, CURL *handle);
+
+typedef struct {
+    const char *url;
+    reqcb_t cb;
+    void *userdata;
+    CURL *handle;
+} requestedreq_t;
+
+dynarr_def(requestedreq_t, requestedreq_dyanrr_t);
+
+typedef struct crawlermodule {
+    void *userdata;
+    // `init` will both initialize the module, and populate all other functions in its entry, if necessary.
+    int (*init)(struct crawlermodule *entry);
+    int (*destroy)(void *userdata);
+    int (*onpagewrite)(void *userdata, const char *url, const byte *data, size_t ndata);
+    int (*onpagecomplete)(void *userdata, pagecompletedata_t *data);
+    int (*onpagedestroy)(void *userdata, pagecompletedata_t *data);
+    filterres_t (*filter)(void *userdata, const char *url);
+} crawlermodule_t;
+
+dynarr_def(crawlermodule_t, crawlermodule_dynarr_t);
+dynarr_def(crawlermodule_t *, crawlermodulep_dynarr_t);
+
+typedef struct {
+    const char *name;
+    crawlermodule_t module;
+} moduleentry_t;
+
+dynarr_def(moduleentry_t, moduleentry_dynarr_t);
+dynarr_def(moduleentry_t *, moduleentryp_dynarr_t);
+
+void *searchextradata(extradata_type_t type, char *key, extradata_t *data, size_t ndata);
+void makerequest(const char *url, reqcb_t cb, void *cbdata);
+
+int mod_pagedata_init(crawlermodule_t *entry);
+int mod_tidy_init(crawlermodule_t *entry);
+int mod_debug_init(crawlermodule_t *entry);
+int mod_parse_init(crawlermodule_t *entry);
+int mod_robots_init(crawlermodule_t *entry);
+
+extern requestedreq_dyanrr_t requestedreqs;
+extern moduleentry_t availmodules[];
+
+#endif
diff --git a/include/util.h b/include/util.h
new file mode 100644
index 0000000..4fb04ab
--- /dev/null
+++ b/include/util.h
@@ -0,0 +1,375 @@
+#ifndef __UTIL_H_
+#define __UTIL_H_
+
+/* NOTE(review): the <...> header names were lost in extraction; reconstructed from usage -- confirm. */
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+typedef enum {
+    LEVEL_DEBUG,
+    LEVEL_INFO,
+    LEVEL_WARN,
+    LEVEL_ERROR,
+    LEVEL_FATAL,
+} loglevel_t;
+
+void volog(loglevel_t level, const char *file, int line, const char *fmt, va_list ap);
+void olog(loglevel_t level, const char *file, int line, const char *fmt, ...);
+void die(const char *fmt, ...);
+void *xmalloc(size_t size);
+void *xcalloc(size_t nmemb, size_t size);
+void *xrealloc(void *ptr, size_t size);
+void xfree(void *ptr);
+size_t sanitize2ascii(char *out, const char *inp, size_t outsize);
+char *sanitize2ascii_dyn(const char *inp, size_t maxlen);
+
+// wikipedia sample function. read comment in util.c
+uint32_t murmur3_32(const uint8_t* key, size_t len, uint32_t seed);
+
+bool hset_charp_cmp(char **lhs, char **rhs);
+size_t hset_charp_hash(char **str);
+
+#define min(A, B) \
+    ({ __typeof__ (A) _a = (A); \
+    __typeof__ (B) _b = (B); \
+    _a < _b ? _a : _b; })
+
+#define max(A, B) \
+    ({ __typeof__ (A) _a = (A); \
+    __typeof__ (B) _b = (B); \
+    _a > _b ? _a : _b; })
+
+#define DYNARR_INIT_CAP 16
+#define DEQUE_INIT_CAP 16
+#define HSET_INIT_CAP 256
+
+#define debug(...) olog(LEVEL_DEBUG, __FILE__, __LINE__, __VA_ARGS__)
+#define info(...) olog(LEVEL_INFO,__FILE__, __LINE__, __VA_ARGS__)
+#define warn( ...) olog(LEVEL_WARN, __FILE__, __LINE__, __VA_ARGS__)
+#define error(...) olog(LEVEL_ERROR, __FILE__, __LINE__, __VA_ARGS__)
+#define fatal(...) olog(LEVEL_FATAL, __FILE__, __LINE__, __VA_ARGS__)
+
+#define array_size(ARR) (sizeof(ARR) / sizeof(*(ARR)))
+
+#define dynarr_def(T, NAME) typedef DYNARR(T) NAME; typedef T NAME ## _innertype
+
+#define DYNARR(T) struct { \
+    T* data; \
+    size_t len, cap; \
+    }
+
+#define dynarr_initi(ARR_T) (ARR_T){ .data = xmalloc(sizeof(ARR_T ## _innertype) * DYNARR_INIT_CAP), .len = 0, .cap = DYNARR_INIT_CAP }
+#define dynarr_init(ARR_T, ARR) (ARR) = dynarr_initi(ARR_T)
+#define dynarr_destroy(ARR) do { \
+    if ((ARR).data == NULL) \
+        break; \
+    xfree((ARR).data); \
+    (ARR).data = NULL; \
+    } while(0)
+
+/* Appends ELEM, doubling the capacity when the array is full. The new size is
+ * computed from the array's own element type rather than typeof(ELEM), so a
+ * narrower argument (e.g. an int literal pushed into a size_t array) cannot
+ * under-allocate the buffer. */
+#define dynarr_push(ARR, ELEM) do { \
+    if ((ARR).len >= (ARR).cap) { \
+        (ARR).cap *= 2; \
+        (ARR).data = xrealloc((ARR).data, (ARR).cap * sizeof(*(ARR).data)); \
+    } \
+    (ARR).data[(ARR).len] = (ELEM); \
+    (ARR).len += 1; \
+    } while(0)
+
+#define dynarr_get(ARR, INDEX) ({ \
+    size_t index = (INDEX); \
+    if (index >= (ARR).len) \
+        die("dyn array out of bounds access"); \
+    &(ARR).data[index]; })
+
+#define dynarr_pop(ARR) ({ \
+    if ((ARR).len < 1) \
+        die("dyn array empty array pop"); \
+    (ARR).data[--(ARR).len]; })
+
+/* Inserts ELEM at INDEX (0..len), shifting later elements up by one. Sizes come
+ * from the array's element type, not from ELEM, for the same reason as dynarr_push. */
+#define dynarr_insert(ARR, INDEX, ELEM) do { \
+    size_t index = (INDEX); \
+    if (index > (ARR).len) \
+        die("dyn array out of bounds insert"); \
+    if ((ARR).len >= (ARR).cap) { \
+        (ARR).cap *= 2; \
+        (ARR).data = xrealloc((ARR).data, (ARR).cap * sizeof(*(ARR).data)); \
+    } \
+    memmove((ARR).data + index + 1, (ARR).data + index, ((ARR).len - index) * sizeof(*(ARR).data)); \
+    (ARR).data[index] = (ELEM); \
+    (ARR).len += 1; \
+    } while (0)
+
+#define dynarr_remove(ARR, INDEX) do { \
+    size_t index = (INDEX); \
+    if ((index) >= (ARR).len) \
+        die("dyn array out of bounds remove"); \
+    memmove((ARR).data + index, (ARR).data + index + 1, ((ARR).len - index - 1) * sizeof(*(ARR).data)); \
+    (ARR).len -= 1; \
+    } while (0)
+
+#define dynarr_extend_fixed(DYN_ARR, FIXED_ARR, NMEMB) do { \
+    size_t nmemb = (NMEMB); \
+    size_t new_cap = (DYN_ARR).cap; \
+    while ((DYN_ARR).len + nmemb > new_cap) \
+        new_cap *= 2; \
+    if (new_cap > (DYN_ARR).cap) { \
+        (DYN_ARR).cap = new_cap; \
+        (DYN_ARR).data = xrealloc((DYN_ARR).data, (DYN_ARR).cap * sizeof(*(DYN_ARR).data)); \
+    } \
+    memcpy((DYN_ARR).data + (DYN_ARR).len, FIXED_ARR, nmemb * sizeof(*(DYN_ARR).data)); \
+    (DYN_ARR).len += nmemb; \
+    } while(0)
+
+#define dynarr_extend_dyn(LHS, RHS) do { \
+    size_t new_cap = (LHS).cap; \
+    while ((LHS).len + (RHS).len > new_cap) \
+        new_cap *= 2; \
+    if (new_cap > (LHS).cap) { \
+        (LHS).cap = new_cap; \
+        (LHS).data = xrealloc((LHS).data, (LHS).cap * sizeof(*(LHS).data)); \
+    } \
+    memcpy((LHS).data + (LHS).len, (RHS).data, (RHS).len * sizeof(*(LHS).data)); \
+    (LHS).len += (RHS).len; \
+    } while(0)
+
+dynarr_def(size_t, size_dynarr_t);
+dynarr_def(int, int_dynarr_t);
+dynarr_def(long, long_dynarr_t);
+dynarr_def(long long, long_long_dynarr_t);
+dynarr_def(char *, charp_dynarr_t);
+dynarr_def(char, char_dynarr_t);
+dynarr_def(void *, vp_dynarr_t);
+
+#define DEQUE(T) struct { \
+    T* base; \
+    size_t cap, front, back, len; \
+    } \
+
+#define deque_def(T, NAME) typedef DEQUE(T) NAME; typedef T NAME ## _innertype
+
+//#define deque_init(DEQ_T) { .base = xmalloc(DEQUE_INIT_CAP * sizeof(DEQ_T ## _innertype)), .cap = DEQUE_INIT_CAP, .front = 0, .back = 0, .len = 0 }
+
+#define deque_init(DEQ_T, DEQ) do { (DEQ).base = xmalloc(DEQUE_INIT_CAP * sizeof(DEQ_T ## _innertype)), (DEQ).cap = DEQUE_INIT_CAP, (DEQ).front = 0, (DEQ).back = 0, (DEQ).len = 0; } while (0)
+
+#define deque_destroy(DEQ) do { \
+    if ((DEQ).base == NULL) \
+        break; \
+    xfree((DEQ).base); \
+    (DEQ).base = NULL; \
+    } while (0)
+
+/* Moves the ring buffer into a fresh allocation of NEW_CAP slots, unwrapping it
+ * so that front == 0 and back == len. Does nothing if cap is already >= NEW_CAP. */
+#define deque_grow(DEQ, NEW_CAP) do { \
+    if ((DEQ).cap >= (NEW_CAP)) \
+        break; \
+    size_t new_cap = (NEW_CAP), size = sizeof(typeof(*(DEQ).base)); \
+    typeof((DEQ).base) new_base = xmalloc(new_cap * size); \
+    if ((DEQ).len > 0 && (DEQ).front >= (DEQ).back) { \
+        size_t end_len = (DEQ).cap - (DEQ).front; \
+        memcpy(new_base, (DEQ).base + (DEQ).front, end_len * size); \
+        memcpy(new_base + end_len, (DEQ).base, (DEQ).back * size); \
+    } \
+    else { \
+        memcpy(new_base, (DEQ).base + (DEQ).front, (DEQ).len * size); \
+    } \
+    xfree((DEQ).base); \
+    (DEQ).base = new_base, (DEQ).cap = new_cap, (DEQ).front = 0, (DEQ).back = (DEQ).len; \
+    } while (0)
+
+#define deque_push_back(DEQ, ELEM) do { \
+    if ((DEQ).len > 0 && (DEQ).back == (DEQ).front) \
+        deque_grow(DEQ, (DEQ).cap * 2); \
+    (DEQ).base[(DEQ).back] = (ELEM); \
+    (DEQ).back = ((DEQ).back + 1) % (DEQ).cap; \
+    (DEQ).len += 1; \
+    } while (0)
+
+#define deque_pop_back(DEQ) ({ \
+    if ((DEQ).len == 0) \
+        die("deque empty pop back"); \
+    (DEQ).back = (DEQ).back == 0 ? (DEQ).cap - 1 : (DEQ).back - 1; \
+    (DEQ).len -= 1; \
+    (DEQ).base[(DEQ).back]; \
+    })
+
+#define deque_push_front(DEQ, ELEM) do { \
+    if ((DEQ).len > 0 && (DEQ).back == (DEQ).front) \
+        deque_grow(DEQ, (DEQ).cap * 2); \
+    (DEQ).front = (DEQ).front == 0 ? (DEQ).cap - 1 : (DEQ).front - 1; \
+    (DEQ).len += 1; \
+    (DEQ).base[(DEQ).front] = (ELEM); \
+    } while (0)
+
+#define deque_pop_front(DEQ) ({ \
+    if ((DEQ).len == 0) \
+        die("deque empty pop front"); \
+    size_t old_front = (DEQ).front; \
+    (DEQ).front = ((DEQ).front + 1) % (DEQ).cap; \
+    (DEQ).len -= 1; \
+    (DEQ).base[old_front]; \
+    })
+
+#define deque_get(DEQ, INDEX) ({ \
+    size_t index = (INDEX); \
+    if (index >= (DEQ).len) \
+        die("deque out of bounds access"); \
+    &(DEQ).base[((DEQ).front + index) % (DEQ).cap]; \
+    })
+
+/* Copies SRC into DST and gives DST its own buffer. cap, front and back are
+ * copied verbatim, so the buffer must hold all cap slots in the same layout;
+ * allocating only len slots would lose wrapped elements and overflow on push. */
+#define deque_clone(DST, SRC) ({ \
+    memcpy(&(DST), &(SRC), sizeof(DST)); \
+    (DST).base = xmalloc((SRC).cap * sizeof(typeof(*(DST).base))); \
+    memcpy((DST).base, (SRC).base, (SRC).cap * sizeof(typeof(*(DST).base))); \
+    })
+
+deque_def(int, int_deque_t);
+deque_def(size_t, size_deque_t);
+deque_def(long, long_deque_t);
+deque_def(long long, longlong_deque_t);
+deque_def(char, char_deque_t);
+deque_def(char *, charp_deque_t);
+deque_def(void *, voidp_deque_t);
+
+size_t parityhash(const void *data, size_t ndata);
+
+#define HSET_BUCKET(T, NAME) struct NAME ## _struct { \
+    struct NAME ## _struct *next; \
+    T data; \
+    }
+
+#define HSET(T, BUCKET_T) struct { \
+    BUCKET_T** buckets; \
+    size_t nbuckets, len; \
+    size_t (*algo)(T*); \
+    bool (*cmp)(T*, T*); \
+    }
+
+#define hset_def(T, NAME) typedef HSET_BUCKET(T, NAME ## _bucket) NAME ## _bucket; \
+    typedef HSET(T, NAME ## _bucket) NAME; \
+    typedef T NAME ## _innertype
+
+#define hset_initi(HSET_T, ALGO, CMP) (HSET_T) { \
+    .buckets = xcalloc(HSET_INIT_CAP, sizeof(HSET_T ## _bucket *)), \
+    .nbuckets = HSET_INIT_CAP, .len = 0, .algo = (ALGO), .cmp = (CMP), \
+    }
+
+#define hset_init(HSET_T, HSET, ALGO, CMP) (HSET) = hset_initi(HSET_T, ALGO, CMP)
+
+#define hset_destroy(HSET) do { \
+    if ((HSET).buckets == NULL) \
+        break; \
+    for (size_t i = 0; i < (HSET).nbuckets; i++) { \
+        if ((HSET).buckets[i] == NULL) \
+            continue; \
+        typeof(*(HSET).buckets) cur = (HSET).buckets[i], next; \
+        for (; cur != NULL; cur = next) { \
+            next = cur->next; \
+            xfree(cur); \
+        } \
+    } \
+    xfree((HSET).buckets); \
+    (HSET).buckets = NULL; \
+    } while (0)
+
+// c has reminded me of how good i had it with rust
+#define hset_iter(HSET, STATE) ({ \
+    typeof(&(HSET).buckets[0]->data) ret; \
+    do { \
+        if ((STATE) == NULL) { \
+            (STATE) = xmalloc(2 * sizeof(void*)); \
+            ((void**) (STATE))[0] = (HSET).buckets; \
+            ((void**) (STATE))[1] = NULL; \
+        } \
+        typeof(&(HSET).buckets) entryptr = &((typeof(&(HSET).buckets)) (STATE))[0]; \
+        typeof((HSET).buckets) bucketptr = &((typeof((HSET).buckets)) (STATE))[1]; \
+        if (*bucketptr != NULL && (*bucketptr)->next != NULL) { \
+            *bucketptr = (*bucketptr)->next; \
+            ret = &(*bucketptr)->data; \
+            break; \
+        } \
+        *bucketptr = NULL; \
+        for (; *entryptr < (HSET).buckets + (HSET).nbuckets; (*entryptr)++) \
+            if (**entryptr != NULL) \
+                break; \
+        if (*entryptr < (HSET).buckets + (HSET).nbuckets) { \
+            *bucketptr = **entryptr; \
+            (*entryptr)++; \
+            ret = &(*bucketptr)->data; \
+        } else { \
+            xfree((STATE)); \
+            (STATE) = NULL; \
+            ret = NULL; \
+        } \
+    } while (0); \
+    ret; \
+    })
+
+/* Adds ELEM unless a matching element (per cmp, or == when cmp is NULL) is
+ * already stored. Evaluates to true when ELEM was already present and nothing
+ * was inserted, false when it was added; len counts the stored elements. */
+#define hset_add(HSET, ELEM) ({ \
+    size_t ind = (HSET).algo(&(ELEM)) % (HSET).nbuckets; \
+    bool ret = false; \
+    if ((HSET).buckets[ind] == NULL) { \
+        (HSET).buckets[ind] = xmalloc(sizeof(typeof(**(HSET).buckets))); \
+        (HSET).buckets[ind]->data = (ELEM); \
+        (HSET).buckets[ind]->next = NULL; \
+    } \
+    else { \
+        typeof(*(HSET).buckets) cur, prev = NULL; \
+        for (cur = (HSET).buckets[ind]; !ret && cur != NULL; prev = cur, cur = cur->next) { \
+            if ((HSET).cmp == NULL) { \
+                if(cur->data == (ELEM)) \
+                    ret = true; \
+            } \
+            else if ((*(HSET).cmp)(&cur->data, &(ELEM))) { \
+                ret = true; \
+            } \
+        } \
+        if (!ret) { \
+            prev->next = xmalloc(sizeof(typeof(**(HSET).buckets))); \
+            prev->next->data = (ELEM); \
+            prev->next->next = NULL; \
+        } \
+    } \
+    if (!ret) \
+        (HSET).len += 1; \
+    ret; \
+    })
+
+#define hset_find(HSET, ELEM) ({ \
+    size_t ind = (HSET).algo(&(ELEM)) % (HSET).nbuckets; \
+    bool ret = false; \
+    typeof(*(HSET).buckets) cur; \
+    for (cur = (HSET).buckets[ind]; !ret && cur != NULL; cur = cur->next) { \
+        if ((HSET).cmp == NULL) { \
+            if(cur->data == (ELEM)) \
+                ret = true; \
+        } \
+        else if ((*(HSET).cmp)(&cur->data, &(ELEM))) { \
+            ret = true; \
+        } \
+    } \
+    ret; \
+    })
+
+hset_def(int, int_hset_t);
+hset_def(char *, charp_hset_t);
+
+typedef unsigned char byte;
+
+#endif
diff --git a/src/crawler.c b/src/crawler.c
new file mode 100644
index 0000000..61318a3
--- /dev/null
+++ b/src/crawler.c
@@ -0,0 +1,829 @@
+/* NOTE(review): the <...> header names below were lost in extraction; reconstructed from usage -- confirm. */
+#include <errno.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/epoll.h>
+#include <sys/types.h>
+#include <curl/curl.h>
+
+#include "curl/multi.h"
+#include "http.h"
+#include "util.h"
+#include "crawler.h"
+#include "json.h"
+#include "module.h"
+
+#define MAX_FAILURE_CNT 5
+#define MAX_CONNECTIONS 32
+#define MAX_CONNECTIONS_PER_HOST 5
+#define EPOLL_BUF_SIZE MAX_CONNECTIONS
+#define DEATH_TIMEOUT_S 30
+
+volatile sig_atomic_t shoulddie = 0;
+volatile struct timespec fatalsigrecved = { 0 };
+
+// Signal handler: the first signal sets shoulddie and records the time; a signal
+// arriving more than DEATH_TIMEOUT_S seconds after that first one calls die().
+void fatalsighandler(int status) {
+    struct timespec now;
+    // clock_gettime is safe to call within a signal handler
+    if (clock_gettime(CLOCK_MONOTONIC, &now) < 0)
+        // die calls fprintf which is not safe to call within a signal handler, but we're going
+        // to die anyways so does it really matter?
+        die("clock_gettime failed: %s (code %d)", strerror(errno), errno);
+    if (shoulddie == 0) {
+        fatalsigrecved = now;
+        shoulddie = 1;
+    }
+    if (now.tv_sec - fatalsigrecved.tv_sec > DEATH_TIMEOUT_S)
+        die("death timeout exceeded (%ds)", DEATH_TIMEOUT_S);
+}
+
+// Thin pass-through wrappers around epoll_wait and curl_multi_socket_action.
+int perf_epoll_wait (int __epfd, struct epoll_event *__events,
+        int __maxevents, int __timeout) {
+    return epoll_wait(__epfd, __events, __maxevents, __timeout);
+}
+
+CURLMcode perf_curl_multi_socket_action(CURLM *multi_handle,
+        curl_socket_t s,
+        int ev_bitmask,
+        int *running_handles) {
+    return curl_multi_socket_action(multi_handle, s, ev_bitmask, running_handles);
+}
+
+// Returns true when host exactly matches an entry of the NULL-terminated
+// allowed_hosts array, or when allowed_hosts is NULL (no restriction).
+bool is_allowed_host(const char *host, const char **allowed_hosts) {
+    if (allowed_hosts == NULL)
+        return true;
+    for (int i = 0; allowed_hosts[i] != NULL; i++)
+        if (strcmp(host, allowed_hosts[i]) == 0)
+            return true;
+    return false;
+}
+
+// Parses link with curl_url_h and, if its host is allowed, queues it at the back
+// of to_crawl. The deque stores the link pointer itself, not a copy. Returns
+// CURLUE_OK on success, CURLUE_BAD_HOSTNAME for a disallowed host, else the parse error.
+CURLUcode append2tocrawl(CURLU *curl_url_h, char *link, const char **allowed_hosts, charp_deque_t *to_crawl) {
+    CURLUcode url_res;
+    url_res = curl_url_set(curl_url_h, CURLUPART_URL, link, 0);
+    if (url_res != CURLUE_OK){
+        char sanitized[100];
+        sanitize2ascii(sanitized, link, sizeof(sanitized));
+        error("URL parsing failed for \"%s\": %s (code %d)",
+                sanitized, curl_url_strerror(url_res), url_res);
+        return url_res;
+    }
+    char *curl_link_host = NULL;
+    url_res = curl_url_get(curl_url_h, CURLUPART_HOST, &curl_link_host, CURLU_PUNYCODE);
+    if (url_res != CURLUE_OK) {
+        error("URL host parsing failed: %s (code %d)", curl_url_strerror(url_res), url_res);
+        return url_res;
+    }
+    if (!is_allowed_host(curl_link_host, allowed_hosts)) {
+        char sanitized[100];
+        sanitize2ascii(sanitized, link, sizeof(sanitized));
+        error("URL not in allowed hosts \"%s\", not crawling", sanitized);
+        curl_free(curl_link_host);
+        return CURLUE_BAD_HOSTNAME;
+    }
+    curl_free(curl_link_host);
+    deque_push_back(*to_crawl, link);
+    return CURLUE_OK;
+}
+
+#define MAX_SOCKETS_PER_TRANS 4
+
+typedef struct transfer {
+    struct transfer *next;
+    CURL *handle;
+    cbdata_t cbdata;
+    char *url;
+    int hostentryind;
+} transfer_t;
+
+typedef struct {
+    const crawlerconfig_t *config;
+    charp_deque_t to_crawl;
+    charp_dynarr_t visited;
+    host_dynarr_t hostentries;
+    transfer_t *headtransfer;
+    int epollfd;
+    long timeout_ms;
+    CURLM *curl_multi_h;
+    int_dynarr_t sockets;
+    moduleentryp_dynarr_t modules;
+} crawlerstate_t;
+
+typedef enum {
+    TRANS_OK,
+    TRANS_ERROR,
+    TRANS_VISITED,
+    TRANS_HOST_ERROR_EXCEEDED,
+    TRANS_HOST_ERROR,
+    TRANS_WRITE_ERR,
+    TRANS_FATAL,
+} transfercode_t;
+
+// Socket callback for the curl multi interface: mirrors the poll state requested
+// for socket s into the epoll set and tracks registered sockets in clientp->sockets.
+int socketcb(CURL *easy, curl_socket_t s, int what, crawlerstate_t *clientp, transfer_t *socketp) {
+    //printf("socketcb(%p, %d, %d, %p, %p)\n", easy, s, what, (void*)clientp, (void*)socketp);
+    struct epoll_event event = { .data = { .fd = s } };
+    int sockind = -1;
+    for (size_t i = 0; i < clientp->sockets.len; i++) {
+        if (s == clientp->sockets.data[i]) {
+            sockind = i;
+            break;
+        }
+    }
+    switch (what) {
+    case CURL_POLL_NONE:
+        return 0;
+    case CURL_POLL_IN:
+        event.events = EPOLLIN;
+        break;
+    case CURL_POLL_OUT:
+        event.events = EPOLLOUT;
+        break;
+    case CURL_POLL_INOUT:
+        event.events = EPOLLIN | EPOLLOUT;
+        break;
+    case CURL_POLL_REMOVE:
+        if (sockind < 0) {
+            fatal("invalid socket specified!");
+            return -1;
+        }
+        if (epoll_ctl(clientp->epollfd, EPOLL_CTL_DEL, s, NULL) < 0) {
+            fatal("epoll_ctl failed: %s (code %d)", strerror(errno), errno);
+            return -1;
+        }
+        dynarr_remove(clientp->sockets, sockind);
+        return 0;
+    }
+    int op = sockind < 0 ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
+    if (epoll_ctl(clientp->epollfd, op, s, &event) < 0) {
+        fatal("epoll_ctl failed: %s (code %d)", strerror(errno), errno);
+        return -1;
+    }
+    if (op == EPOLL_CTL_ADD)
+        dynarr_push(clientp->sockets, s);
+    return 0;
+}
+
+// Timer callback: stores the requested timeout_ms in *clientp.
+int timercb(CURLM *multi, long timeout_ms, long *clientp) {
+    *clientp = timeout_ms;
+    return 0;
+}
+
+// Finds the entry for host in host_data, or creates one with a User-Agent header
+// picked at random from the NULL-terminated avail_uas list. Returns true only when
+// an existing entry was found; *hostentryind is the entry index, or -1 on failure.
+bool gethostdata(const char *host, const char **avail_uas, host_dynarr_t *host_data, int *hostentryind) {
+    const char *picked_ua = NULL;
+    for (size_t i = 0; i < host_data->len; i++) {
+        if (strcmp(host, host_data->data[i].host) == 0) {
+            *hostentryind = i;
+            return true;
+        }
+    }
+    if (avail_uas == NULL) {
+        *hostentryind = -1;
+        return false;
+    }
+    // pick one
+    size_t navail_uas = 0;
+    for (; avail_uas[navail_uas] != NULL; navail_uas++)
+        ;
+    // an empty list would make the modulo below divide by zero
+    if (navail_uas == 0) {
+        *hostentryind = -1;
+        return false;
+    }
+    picked_ua = avail_uas[(size_t) random() % navail_uas];
+    const char prefix[] = "User-Agent: ";
+    char *new_buf = xmalloc(strlen(picked_ua) + sizeof(prefix));
+    *new_buf = '\0';
+    strcpy(new_buf, prefix);
+    strcat(new_buf, picked_ua);
+    // curl_slist_append() stores its own copy of the string, so the scratch
+    // buffer is released on the success path as well as on failure
+    struct curl_slist *headers = curl_slist_append(NULL, new_buf);
+    xfree(new_buf);
+    if (headers == NULL) {
+        *hostentryind = -1;
+        return false;
+    }
+    hostentry_t new_entry = {
+        .host = host,
+        .headers = headers,
+        .totalfailurecnt = 0,
+        .failurecnt = 0,
+        .visitcnt = 0
+    };
+    dynarr_push(*host_data, new_entry);
+    *hostentryind = host_data->len - 1;
+    return false;
+}
+
+// Linear scan: true when url is byte-for-byte equal to an entry of visited.
+bool didvisit(const char *url, charp_dynarr_t *visited) {
+    int lhslen = strlen(url);
+    for (size_t i = 0; i < visited->len; i++) {
+        int rhslen = strlen(visited->data[i]);
+        if (lhslen != rhslen)
+            continue;
+        if (memcmp(url, visited->data[i], lhslen) == 0)
+            return true;
+    }
+    return false;
+}
+
+// Marks url as visited and starts a transfer for it on the multi handle,
+// appending the new transfer to the state->headtransfer list. On TRANS_OK the
+// transfer is returned through *transfer_ret; on every other code the transfer
+// (if one was allocated) has been freed and *transfer_ret must not be used.
+transfercode_t starttransfer(CURLU *curl_url_h, crawlerstate_t *state, char *url,
+        transfer_t **transfer_ret, bool wasrequested) {
+    char sanitized[100];
+    if (didvisit(url, &state->visited))
+        // DO NOT free the URL. We are going to use it in our JSON later
+        return TRANS_VISITED;
+    dynarr_push(state->visited, url);
+    sanitize2ascii(sanitized, url, sizeof(sanitized));
+    info("crawling \"%s\"", sanitized);
+
+    // Retrieves handle and host data
+    *transfer_ret = xmalloc(sizeof(transfer_t));
+    memset(*transfer_ret, 0, sizeof(transfer_t));
+    transfer_t *transfer = *transfer_ret;
+    char *curl_host = NULL;
+    CURLUcode url_res;
+    // each call's CURLUcode is stored in url_res before it is compared, so the log below reports it
+    if ((url_res = curl_url_set(curl_url_h, CURLUPART_URL, url, 0)) != CURLUE_OK ||
+            (url_res = curl_url_get(curl_url_h, CURLUPART_HOST, &curl_host, CURLU_PUNYCODE)) != CURLUE_OK) {
+        fatal("URL host parsing failed: %s (code %d)", curl_url_strerror(url_res), url_res);
+        curl_free(curl_host);
+        xfree(transfer);
+        return TRANS_FATAL;
+    }
+    int hostentryind;
+    bool wascached = gethostdata(curl_host, useragents, &state->hostentries, &hostentryind);
+    if (hostentryind < 0) {
+        // host is (should be) in punycode so it's fine if we don't sanitize it
+        fatal("Failed to get host entry for host \"%s\"", curl_host);
+        curl_free(curl_host);
+        xfree(transfer);
+        return TRANS_FATAL;
+    }
+    if (wascached)
+        curl_free(curl_host);
+    hostentry_t *hostentry = &state->hostentries.data[hostentryind];
+    initcbdata(url, &state->modules, &transfer->cbdata);
+    transfer->hostentryind = hostentryind;
+    transfer->url = url;
+    transfer->handle = makehandle(url, hostentry, &transfer->cbdata, wasrequested);
+    if (transfer->handle == NULL) {
+        fatal("makehandle() failed");
+        xfree(transfer->cbdata.writecb_data.base);
+        xfree(transfer);
+        return TRANS_FATAL;
+    }
+    if (hostentry->failurecnt > MAX_FAILURE_CNT) {
+        error("Max failure count (%d) for host exceeded", MAX_FAILURE_CNT);
+        curl_easy_cleanup(transfer->handle);
+        xfree(transfer->cbdata.writecb_data.base);
+        xfree(transfer);
+        return TRANS_HOST_ERROR_EXCEEDED;
+    }
+
+    // Adds the transfer to be the multi handle tbh idk what im doing
+    CURLMcode mc;
+    mc = curl_multi_add_handle(state->curl_multi_h, transfer->handle);
+    if (mc) {
+        // erm,what the sigma?
+        fatal("curl_multi_add_handle failed: %s (code %d)", curl_multi_strerror(mc), mc);
+        curl_easy_cleanup(transfer->handle);
+        xfree(transfer->cbdata.writecb_data.base);
+        xfree(transfer);
+        return TRANS_FATAL;
+    }
+
+    // Add the transfer to the end of the transfer list
+    if (state->headtransfer == NULL) {
+        state->headtransfer = transfer;
+    }
+    else {
+        transfer_t *last = state->headtransfer;
+        for (; last->next != NULL; last = last->next)
+            ;
+        last->next = transfer;
+    }
+
+    return TRANS_OK;
+}
+
+// Detaches the transfer's easy handle from the multi handle and frees the handle, page buffer and transfer.
+void destroytransfer(CURLM *curl_multi_h, transfer_t *trans) {
+    curl_multi_remove_handle(curl_multi_h, trans->handle);
+    curl_easy_cleanup(trans->handle);
+    if (trans->cbdata.writecb_data.base != NULL)
+        xfree(trans->cbdata.writecb_data.base);
+    xfree(trans);
+}
+
+// Handles a completed transfer. On a curl error it classifies the failure,
+// updates the host's failure counters and may requeue the URL. On success it
+// either hands the page to cb (if non-NULL) or runs the module hooks, queues
+// the parsed links and returns any module-provided JSON through extrajson.
+// TODO: Extract arguments into a struct
+transfercode_t transferfinished(CURLU *curl_url_h, crawlerstate_t *state,
+        CURLMsg *msg, transfer_t *trans, charp_dynarr_t *links, jsonkv_dynarr_t *extrajson,
+        reqcb_t cb, void *cb_userdata) {
+    CURLcode trans_res = msg->data.result;
+    char sanitized[100];
+    hostentry_t *hostentry = &state->hostentries.data[trans->hostentryind];
+    hostentry->visitcnt++;
+    // Check if transfer went OK
+    if (trans_res != CURLE_OK) {
+        sanitize2ascii(sanitized, trans->url, sizeof(sanitized));
+        transfercode_t ret;
+        headercb_data_t *headerdata;
+        switch (trans_res) {
+        case CURLE_WRITE_ERROR:
+            headerdata = &trans->cbdata.headercb_data;
+            if (!(headerdata->flags & HEADERCB_VALID_MIME))
+                info("request failed to \"%s\": is not of mime type text/html, not crawling", sanitized);
+            else if (headerdata->status != 200 && headerdata->status > 0)
+                info("request failed to \"%s\": returned status code %d", sanitized, headerdata->status);
+            else
+                info("request failed to \"%s\": header parsing error or page too big", sanitized);
+            ret = TRANS_WRITE_ERR;
+            break;
+        case CURLE_REMOTE_ACCESS_DENIED:
+        case CURLE_BAD_CONTENT_ENCODING:
+        case CURLE_PEER_FAILED_VERIFICATION:
+        case CURLE_WEIRD_SERVER_REPLY:
+        case CURLE_BAD_DOWNLOAD_RESUME:
+            hostentry->failurecnt++;
+            hostentry->totalfailurecnt++;
+            // fall through
+        case CURLE_RANGE_ERROR:
+        case CURLE_UNSUPPORTED_PROTOCOL:
+        case CURLE_AUTH_ERROR:
+        case CURLE_LOGIN_DENIED:
+        case CURLE_TOO_MANY_REDIRECTS:
+        case CURLE_FILESIZE_EXCEEDED:
+        case CURLE_HTTP2:
+        case CURLE_HTTP3:
+        case CURLE_HTTP2_STREAM:
+        case CURLE_QUIC_CONNECT_ERROR:
+            error("non-fatal error: %s (code %d)", curl_easy_strerror(trans_res), trans_res);
+            ret = TRANS_ERROR;
+            break;
+        case CURLE_SSL_CONNECT_ERROR:
+        case CURLE_COULDNT_RESOLVE_HOST:
+        case CURLE_COULDNT_CONNECT:
+        case CURLE_OPERATION_TIMEDOUT:
+            info("retrying (eventually)...: %s (code %d)", curl_easy_strerror(trans_res), trans_res);
+            // Add to to_crawl and remove from visited so that we retry the transfer (eventually)
+            deque_push_front(state->to_crawl, trans->url);
+            for (size_t i = 0; i < state->visited.len; i++) {
+                // Shallow compare is fine
+                if (state->visited.data[i] == trans->url) {
+                    dynarr_remove(state->visited, i);
+                    break;
+                }
+            }
+            hostentry->failurecnt++;
+            hostentry->totalfailurecnt++;
+            ret = TRANS_HOST_ERROR;
+            break;
+        default:
+            fatal("aborting...: %s (code %d)", curl_easy_strerror(trans_res), trans_res);
+            ret = TRANS_FATAL;
+            break;
+        }
+        return ret;
+    }
+    hostentry->failurecnt = 0;
+
+    // Link aggregation and parsing logic
+    writecb_data_t writecb_data = trans->cbdata.writecb_data;
+    char *page = (char*)writecb_data.base;
+    size_t npage = writecb_data.begin - writecb_data.base;
+    dynarr_init(charp_dynarr_t, *links);
+    debug("%zu links in %zu bytes ", links->len, npage);
+    pagecompletedata_t moduledata = {
+        .url = trans->url,
+        .handle = trans->handle,
+        .page = page,
+        .npage = npage,
+        .parsedlinks = links,
+    };
+
+    // Call callback and terminate, if provided
+    if (cb != NULL) {
+        dynarr_init(jsonkv_dynarr_t, *extrajson);
+        cb(cb_userdata, moduledata.url, moduledata.page, moduledata.npage, trans->handle);
+        return TRANS_OK;
+    }
+
+    // Module and JSON stuff
+    dynarr_init(extradata_dynarr_t, moduledata.extradata);
+    int rc;
+    for (size_t i = 0; i < state->modules.len; i++) {
+        if (state->modules.data[i]->module.onpagecomplete != NULL) {
+            rc = state->modules.data[i]->module.onpagecomplete(
+                    state->modules.data[i]->module.userdata, &moduledata);
+            if (rc != 0)
+                error("module %s onpagecomplete failed with code %d", state->modules.data[i]->name, rc);
+        }
+    }
+    for (size_t i = 0; i < links->len; i++)
+        append2tocrawl(curl_url_h, links->data[i], state->config->allowedhosts, &state->to_crawl);
+    for (size_t i = state->modules.len - 1; i != SIZE_MAX; i--) {
+        if (state->modules.data[i]->module.onpagedestroy != NULL) {
+            int rc;
+            rc = state->modules.data[i]->module.onpagedestroy(
+                    state->modules.data[i]->module.userdata, &moduledata);
+            if (rc != 0)
+                error("module %s onpagedestroy failed with code %d", state->modules.data[i]->name, rc);
+        }
+    }
+    jsonkv_dynarr_t *jsoncand =
+        searchextradata(EXTRA_JSON, "json", moduledata.extradata.data, moduledata.extradata.len);
+    dynarr_destroy(moduledata.extradata);
+    // TODO: Make it so we don't initialize an empty array on no extrajson
+    if (jsoncand == NULL)
+        dynarr_init(jsonkv_dynarr_t, *extrajson);
+    else
+        *extrajson = *jsoncand;
+    return TRANS_OK;
+}
+
+typedef struct {
+    char *url;
+    charp_dynarr_t links;
+    jsonkv_dynarr_t extrajson;
+} linkentry_t;
+
+dynarr_def(linkentry_t, linkentry_dynarr_t);
+
+// Returns a newly allocated array holding each distinct link found across all
+// entries of links. The strings themselves are shared, not copied.
+charp_dynarr_t getuniqlinks(linkentry_dynarr_t *links) {
+    // Filters for unique links
+    charp_hset_t uniqlinks_hset = hset_initi(charp_hset_t, hset_charp_hash, hset_charp_cmp);
+    for (size_t i = 0; i < links->len; i++)
+        for (size_t j = 0; j < links->data[i].links.len; j++)
+            hset_add(uniqlinks_hset, links->data[i].links.data[j]);
+    charp_dynarr_t uniqlinks = dynarr_initi(charp_dynarr_t);
+    void *saveptr = NULL;
+    char **link;
+    while ((link = hset_iter(uniqlinks_hset, saveptr)) != NULL)
+        dynarr_push(uniqlinks, *link);
+    hset_destroy(uniqlinks_hset);
+    return uniqlinks;
+}
+ +// Does some json stuff idek +jsonval_t links2json(charp_dynarr_t *uniqlinks, linkentry_dynarr_t *links, host_dynarr_t *hosts) { + jsonkv_t kvpair; + + // Adds key/value pair for each host + jsonkv_dynarr_t hostmap = dynarr_initi(jsonkv_dynarr_t); + for (size_t i = 0; i < hosts->len; i++) { + hostentry_t *curhost = &hosts->data[i]; + jsonkv_dynarr_t entrymap = dynarr_initi(jsonkv_dynarr_t); + bool isup = curhost->visitcnt > curhost->totalfailurecnt; + kvpair = (jsonkv_t){ .key = "up", .val = json_createbool(isup) }; + dynarr_push(entrymap, kvpair); + kvpair = (jsonkv_t){ .key = "visitcnt", .val = json_createint(curhost->visitcnt) }; + dynarr_push(entrymap, kvpair); + kvpair = (jsonkv_t){ .key = "failurecnt", .val = json_createint(curhost->totalfailurecnt) }; + dynarr_push(entrymap, kvpair); + kvpair = (jsonkv_t){ .key = curhost->host, .val = json_createobj(entrymap) }; + dynarr_push(hostmap, kvpair); + } + + // Adds key/value pair for each visited link + jsonkv_dynarr_t linkmap = dynarr_initi(jsonkv_dynarr_t); + for (size_t i = 0; i < links->len; i++) { + linkentry_t *curentry = &links->data[i]; + jsonkv_dynarr_t entrymap = dynarr_initi(jsonkv_dynarr_t); + jsonval_dynarr_t urlindicies = dynarr_initi(jsonval_dynarr_t); + for (size_t j = 0; j < curentry->links.len; j++) { + char *cururl = curentry->links.data[j]; + size_t urlind; + for (urlind = 0; urlind < uniqlinks->len && strcmp(uniqlinks->data[urlind], cururl) != 0; urlind++) + ; + if (urlind >= uniqlinks->len) + // Just a baby drinking coffee + die("You should never see this message unless i fucked the hash set implementation"); + jsonval_t urlentry = json_createint(urlind); + dynarr_push(urlindicies, urlentry); + } + kvpair = (jsonkv_t){ .key = "link_indicies", .val = json_createarr(urlindicies) }; + dynarr_push(entrymap, kvpair); + kvpair = (jsonkv_t){ .key = "nlinks", .val = json_createint(urlindicies.len) }; + dynarr_push(entrymap, kvpair); + for (size_t i = 0; i < curentry->extrajson.len; i++) + 
dynarr_push(entrymap, curentry->extrajson.data[i]); + kvpair = (jsonkv_t){ .key = curentry->url, .val = json_createobj(entrymap) }; + dynarr_push(linkmap, kvpair); + } + + // Assembles object containing uniqlinks, hostmap, and linkmap + jsonkv_dynarr_t parentmap = dynarr_initi(jsonkv_dynarr_t); + jsonval_dynarr_t uniqlinks_json = dynarr_initi(jsonval_dynarr_t); + for (size_t i = 0; i < uniqlinks->len; i++) + dynarr_push(uniqlinks_json, json_createstr(uniqlinks->data[i])); + kvpair = (jsonkv_t) { .key = "hosts", .val = json_createobj(hostmap) }; + dynarr_push(parentmap, kvpair); + kvpair = (jsonkv_t) { .key = "urlindicies", .val = json_createarr(uniqlinks_json) }; + dynarr_push(parentmap, kvpair); + kvpair = (jsonkv_t) { .key = "links", .val = json_createobj(linkmap) }; + dynarr_push(parentmap, kvpair); + return json_createobj(parentmap); +} + +void crawler(const char *seed, const crawlerconfig_t *config) { + CURLU *curl_url_h = curl_url(); + if (curl_url_h == NULL) { + fatal("CURL URL failed to initialize"); + return; + } + + if ((signal(SIGINT, fatalsighandler) == SIG_ERR)) { + //(signal(SIGSEGV, fatalsighandler) == SIG_ERR) || + //(signal(SIGTERM, fatalsighandler) == SIG_ERR)) { + fatal("signal() failed: %s (code %d)", strerror(errno), errno); + return; + } + + crawlerstate_t state = { 0 }; + state.epollfd = -1; // For cleanup if error + deque_init(charp_deque_t, state.to_crawl); + dynarr_init(charp_dynarr_t, state.visited); + dynarr_init(host_dynarr_t, state.hostentries); + dynarr_init(int_dynarr_t, state.sockets); + state.config = config; + + char *seed_buf = xmalloc(strlen(seed) + 1); + strcpy(seed_buf, seed); + deque_push_back(state.to_crawl, seed_buf); + + struct epoll_event *event_buf = xmalloc(sizeof(struct epoll_event) * EPOLL_BUF_SIZE); + state.curl_multi_h = curl_multi_init(); + if (state.curl_multi_h == NULL) { + fatal("CURL Multi failed to initialize"); + goto cleanup; + } + CURLMcode mc; + if ((mc = curl_multi_setopt(state.curl_multi_h, 
CURLMOPT_SOCKETFUNCTION, socketcb)) || + (mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_SOCKETDATA, &state)) || + (mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_TIMERFUNCTION, timercb)) || + (mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_TIMERDATA, &state.timeout_ms)) || + (mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_MAX_HOST_CONNECTIONS, MAX_CONNECTIONS_PER_HOST)) || + (mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_MAX_TOTAL_CONNECTIONS, MAX_CONNECTIONS))) { + fatal("curl_multi_setopt failed: %s (code %d)", curl_multi_strerror(mc), mc); + goto cleanup; + } + + state.epollfd = epoll_create1(0); + if (state.epollfd < 0) { + fatal("epoll_create1 failed: %s (code %d)", strerror(errno), errno); + goto cleanup; + } + + linkentry_dynarr_t linkentries = dynarr_initi(linkentry_dynarr_t); + + state.modules = config->enabledmodules; + for (size_t i = 0; i < state.modules.len; i++) { + int (*init)(crawlermodule_t *) = state.modules.data[i]->module.init; + memset(&state.modules.data[i]->module, 0, sizeof(crawlermodule_t)); + state.modules.data[i]->module.init = init; + state.modules.data[i]->module.init(&state.modules.data[i]->module); + } + + size_t ntransfers = 0; + int running_handles = 0; + for(;;) { + if (shoulddie > 0) { + fatal("shoulddie = %d", shoulddie); + break; + } + + transfer_t *transfer; + for (size_t i = 0; i < requestedreqs.len && ntransfers < MAX_CONNECTIONS; i++) { + requestedreq_t *request = &requestedreqs.data[i]; + if (request->handle != NULL) + continue; + transfercode_t transcode = starttransfer(curl_url_h, &state, + (char*)request->url, &transfer, true); + if (transcode == TRANS_FATAL) + break; + else if (transcode != TRANS_OK) + continue; + transfer->cbdata.writecb_data.wasrequested = true; + request->handle = transfer->handle; + ntransfers++; + } + + charp_dynarr_t stalled = dynarr_initi(charp_dynarr_t); + for (size_t i = 0; i < state.to_crawl.len && ntransfers < MAX_CONNECTIONS; i++) { + char *crawling = 
deque_pop_front(state.to_crawl); + bool passed = true; + for (size_t i = 0; i < state.modules.len; i++) { + moduleentry_t *entry = state.modules.data[i]; + if (entry->module.filter == NULL) + continue; + filterres_t res = entry->module.filter(entry->module.userdata, crawling); + if (res != FILTER_PASS) { + passed = false; + char sanitized[100]; + sanitize2ascii(sanitized, crawling, sizeof(sanitized)); + if (res == FILTER_STALL) { + //debug("URL \"%s\" was stalled by module %s", sanitized, entry->name); + dynarr_push(stalled, crawling); + break; + } + else if (res == FILTER_REJECT) { + debug("URL \"%s\" was rejected by module %s", sanitized, entry->name); + // Don't add it to the visited list, in case the filter changes its mind + break; + } + } + } + if (!passed) + continue; + transfercode_t transcode = starttransfer(curl_url_h, &state, crawling, &transfer, false); + if (transcode == TRANS_FATAL) + break; + else if (transcode != TRANS_OK) + continue; + ntransfers++; + } + while (stalled.len > 0) + deque_push_back(state.to_crawl, dynarr_pop(stalled)); + + mc = perf_curl_multi_socket_action(state.curl_multi_h, -1, CURL_SOCKET_TIMEOUT, &running_handles); + if (mc) { + fatal("curl_multi_socket_action failed: %s (code %d)", curl_multi_strerror(mc), mc); + goto cleanup; + } + + if (running_handles == 0) + continue; + + // Main add/remove transfer loop + int availfds = perf_epoll_wait(state.epollfd, event_buf, EPOLL_BUF_SIZE, state.timeout_ms); + if (availfds < 0) { + fatal("epoll_wait failed: %s (code %d)", strerror(errno), errno); + break; + } + transfer_t *trans = state.headtransfer; + if (trans == NULL && state.to_crawl.len == 0) { + info("No more URLs left to crawl"); + break; + } + + // Tell CURL about any action connections + if (availfds > 0) { + for (int i = 0; i < availfds; i++) { + struct epoll_event *connevent = &event_buf[i]; + int ev_bitmask = 0; + // TODO: Check for errors on the descriptor + if (connevent->events & EPOLLIN) + ev_bitmask |= CURL_CSELECT_IN; 
+ if (connevent->events & EPOLLOUT) + ev_bitmask |= CURL_CSELECT_OUT; + CURLMcode mc; + mc = perf_curl_multi_socket_action(state.curl_multi_h, connevent->data.fd, + ev_bitmask, &running_handles); + if (mc) { + fatal("curl_multi_socket_action failed: %s (code %d)", curl_multi_strerror(mc), mc); + goto cleanup; + } + } + } + else { + CURLMcode mc; + mc = perf_curl_multi_socket_action(state.curl_multi_h, CURL_SOCKET_TIMEOUT, -1, + &running_handles); + if (mc) { + fatal("curl_multi_socket_action failed: %s (code %d)", curl_multi_strerror(mc), mc); + break; + } + } + // Process and prunes any finished connections + CURLMsg *msg; + int nmsgs; + while ((msg = curl_multi_info_read(state.curl_multi_h, &nmsgs)) != NULL){ + // No other message types are currently defined, but in case they are + if (msg->msg != CURLMSG_DONE) + continue; + transfer_t *trans, *prev; + for (trans = state.headtransfer, prev = NULL; + trans != NULL; + prev = trans, trans = trans->next + ) + if (trans->handle == msg->easy_handle) + break; + if (trans == NULL) { + fatal("message handle not found, handle=%p, result=%d", + (void*)msg->easy_handle, msg->data.result); + goto cleanup; + } + + // Check if the request was requested + size_t requestind; + for (requestind = 0; + requestind < requestedreqs.len && + requestedreqs.data[requestind].handle != trans->handle; + requestind++) + ; + reqcb_t cb = NULL; + void *userdata = NULL; + if (requestind < requestedreqs.len) { + cb = requestedreqs.data[requestind].cb; + userdata = requestedreqs.data[requestind].userdata; + dynarr_remove(requestedreqs, requestind); + } + + // Handle it and log that we visited it + charp_dynarr_t links; + jsonkv_dynarr_t extrajson; + transfercode_t transcode = transferfinished(curl_url_h, &state, msg, trans, &links, + &extrajson, cb, userdata); + if (cb == NULL && transcode == TRANS_OK) { + linkentry_t entry = { .url = trans->url, .links = links, .extrajson = extrajson }; + dynarr_push(linkentries, entry); + } + + // Remove it + if 
(prev == NULL) + // Transfer was the head + state.headtransfer = trans->next; + else + // Transfer was not the head + prev->next = trans->next; + prev = trans; + destroytransfer(state.curl_multi_h, trans); + if (transcode == TRANS_FATAL) + goto cleanup; + ntransfers--; + } + } + + jsonval_t json; + charp_dynarr_t uniqlinks; +cleanup: + // Create and write out all of the json + uniqlinks = getuniqlinks(&linkentries); + json = links2json(&uniqlinks, &linkentries, &state.hostentries); + json_write(stdout, &json); + // Destroy all modules first bc they might have some save data + for (size_t i = 0; i < state.modules.len; i++) + if (state.modules.data[i]->module.destroy != NULL) + state.modules.data[i]->module.destroy(state.modules.data[i]->module.userdata); + // Destroy all links + json_destroy(&json); + for (size_t i = 0; i < linkentries.len; i++) + dynarr_destroy(linkentries.data[i].links); + dynarr_destroy(linkentries); + for (size_t i = 0; i < uniqlinks.len; i++) + xfree(uniqlinks.data[i]); + dynarr_destroy(uniqlinks); + // Destroy everything else + for (size_t i = 0; i < state.hostentries.len; i++) { + curl_slist_free_all(state.hostentries.data[i].headers); + curl_free((void*)state.hostentries.data[i].host); + } + for (transfer_t *trans = state.headtransfer, *next; trans != NULL; trans = next) { + next = trans->next; + destroytransfer(state.curl_multi_h, trans); + } + if (state.curl_multi_h != NULL) + curl_multi_cleanup(state.curl_multi_h); + if (state.epollfd >= 0) + close(state.epollfd); + xfree(event_buf); + curl_url_cleanup(curl_url_h); + dynarr_destroy(state.visited); + dynarr_destroy(state.hostentries); + dynarr_destroy(state.sockets); + dynarr_destroy(requestedreqs); + deque_destroy(state.to_crawl); +} diff --git a/src/http.c b/src/http.c new file mode 100644 index 0000000..5d9344a --- /dev/null +++ b/src/http.c @@ -0,0 +1,152 @@ +#include +#include +#include +#include +#include +#include + +#include "http.h" +#include "util.h" + +#define INIT_PAGE_SIZE 
8192 +#define MAX_PAGE_SIZE 1048576 +#define MEM_INC_FACTOR 2 + +#define TIMEOUT_MS 10000 +#define CONNECT_TIMEOUT_MS 3000 + +const char *useragents[] = { + "AvaBot", + NULL, +}; + +size_t bufwritecb(const byte *ptr, size_t size, size_t nmemb, writecb_data_t *userdata) { + for (size_t i = 0; i < userdata->modules->len; i++) { + if (userdata->modules->data[i]->module.onpagewrite != NULL) { + int rc; + rc = userdata->modules->data[i]->module.onpagewrite( + userdata->modules->data[i]->module.userdata, userdata->url, ptr, nmemb); + if (rc != 0) + error("module %s onpagewrite failed with code %d", userdata->modules->data[i]->name); + } + } + size_t len = nmemb; + while (userdata->begin + nmemb + 1 > userdata->end) { + // Buffer is undersized + size_t buf_len = userdata->end - userdata->base; + if (buf_len >= MAX_PAGE_SIZE) + break; + size_t new_buf_len = buf_len * MEM_INC_FACTOR; + if (new_buf_len > MAX_PAGE_SIZE) + new_buf_len = MAX_PAGE_SIZE; + byte *new_base = xrealloc(userdata->base, new_buf_len); + userdata->end = new_base + new_buf_len; + userdata->begin = new_base + (userdata->begin - userdata->base); + userdata->base = new_base; + } + if (userdata->begin + nmemb + 1 > userdata->end) + // Buffer is still undersized + len = userdata->end - userdata->begin; + memcpy(userdata->begin, ptr, len); + userdata->begin[len] = '\0'; + userdata->begin += len; + return len; +} + +bool is_redirect(int status) { + return status == 301 || // Moved Permanently + status == 302 || // Found + status == 307 || // Temporary Redirect + status == 308; // Permanent Redirect +} + +size_t headerwritecb(const char *buffer, size_t _size, size_t nitems, headercb_data_t *userdata) { + const char content_type_str[] = "content-type:", html_mime_str[] = "text/html", http_str[] = "HTTP/"; + // Parses HTTP status line + if (nitems < sizeof(http_str) - 1) + return nitems; + if (memcmp(buffer, http_str, sizeof(http_str) - 1) == 0) { + // Header is an http status line + userdata->num_requests++; + 
const char *status_line = memchr(buffer, ' ', nitems); + for (; *status_line == ' '; status_line++) { + // Ensures that status_line...(buffer+nitems) can fit a status code (3 numbers) + if (status_line > buffer + nitems - 3) { + userdata->status = 0; + return CURL_WRITEFUNC_ERROR; + } + } + char code_str[4] = { status_line[0], status_line[1], status_line[2], '\0' }; + userdata->status = atoi(code_str); + if (userdata->status == 200 || is_redirect(userdata->status)) + return nitems; + return CURL_WRITEFUNC_ERROR; + } + if (userdata->status == 0 || is_redirect(userdata->status)) + return nitems; + // Parses Content-Type header + if (userdata->flags & HEADERCB_CONTENT_TYPE_ENCOUNTERED) + return nitems; + // We need an extra byte to plop header_val to the byte after the ':' + if (nitems < sizeof(content_type_str)) + return nitems; + for (size_t i = 0; i < sizeof(content_type_str)-1; i++) + if (tolower(buffer[i]) != content_type_str[i]) + return nitems; + const char *header_val = buffer + sizeof(content_type_str); + for (; *header_val == ' '; header_val++) + // Ensures that header_val..(buffer+nitems) can fit "text/html" + if (header_val > buffer + nitems - sizeof(html_mime_str) + 1) + return nitems; + userdata->flags |= HEADERCB_CONTENT_TYPE_ENCOUNTERED; + if (memcmp(header_val, html_mime_str, sizeof(html_mime_str)-1) != 0) + return CURL_WRITEFUNC_ERROR; + userdata->flags |= HEADERCB_VALID_MIME; + return nitems; +} + +void initcbdata(const char *url, moduleentryp_dynarr_t *modules, cbdata_t *data) { + memset(data, 0, sizeof(cbdata_t)); + data->writecb_data.base = xmalloc(INIT_PAGE_SIZE); + data->writecb_data.begin = data->writecb_data.base; + data->writecb_data.end = data->writecb_data.base + INIT_PAGE_SIZE; + data->writecb_data.url = url; + data->writecb_data.modules = modules; +} + +CURL *makehandle(const char *url, hostentry_t *host_entry, cbdata_t *cbdata, bool wasrequested) { + CURL *curl_h = curl_easy_init(); + CURLcode easy_res; + if (curl_h == NULL) { + 
error("curl failed to initialize\n"); + return NULL; + } + if (/* 1MiB/s max send speed */ + (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAX_SEND_SPEED_LARGE, 1024 * 1024)) != CURLE_OK || + /* 1MiB/s max recv speed */ + (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAX_RECV_SPEED_LARGE, 1024 * 1024)) != CURLE_OK || + (easy_res = curl_easy_setopt(curl_h, CURLOPT_TIMEOUT_MS, TIMEOUT_MS)) != CURLE_OK || + (easy_res = curl_easy_setopt(curl_h, CURLOPT_CONNECTTIMEOUT_MS, CONNECT_TIMEOUT_MS)) != CURLE_OK || + (easy_res = curl_easy_setopt(curl_h, CURLOPT_FOLLOWLOCATION, 1)) != CURLE_OK || + (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAXREDIRS, 3)) != CURLE_OK || + (easy_res = curl_easy_setopt(curl_h, CURLOPT_HTTPHEADER, host_entry->headers)) != CURLE_OK || + (easy_res = curl_easy_setopt(curl_h, CURLOPT_PROTOCOLS_STR, "http,https")) != CURLE_OK || + (easy_res = curl_easy_setopt(curl_h, CURLOPT_WRITEFUNCTION, bufwritecb)) != CURLE_OK || + (easy_res = curl_easy_setopt(curl_h, CURLOPT_WRITEDATA, &cbdata->writecb_data)) != CURLE_OK || + (easy_res = curl_easy_setopt(curl_h, CURLOPT_URL, url) != CURLE_OK)) { + error("curl setopt failed: %s (code %d)\n", curl_easy_strerror(easy_res), easy_res); + curl_easy_cleanup(curl_h); + return NULL; + } + + if (!wasrequested && + ((easy_res = curl_easy_setopt(curl_h, CURLOPT_HEADERFUNCTION, headerwritecb)) != CURLE_OK || + (easy_res = curl_easy_setopt(curl_h, CURLOPT_HEADERDATA, &cbdata->headercb_data)) != CURLE_OK)) { + error("curl setopt failed: %s (code %d)\n", curl_easy_strerror(easy_res), easy_res); + curl_easy_cleanup(curl_h); + return NULL; + } + return curl_h; +} + + diff --git a/src/json.c b/src/json.c new file mode 100644 index 0000000..f6e78aa --- /dev/null +++ b/src/json.c @@ -0,0 +1,88 @@ +#include +#include + +#include "json.h" + +jsonval_t json_createobj(jsonkv_dynarr_t pairs) { + jsonkv_dynarr_t *stolen = xmalloc(sizeof(jsonkv_dynarr_t)); + *stolen = pairs; + return (jsonval_t){ .type = JSON_OBJECT, .data = stolen }; +} + 
+jsonval_t json_createarr(jsonval_dynarr_t elems) { + jsonval_dynarr_t *stolen = xmalloc(sizeof(jsonval_dynarr_t)); + *stolen = elems; + return (jsonval_t){ .type = JSON_ARRAY, .data = stolen }; +} + +jsonval_t json_createstr(const char *str) { + return (jsonval_t){ .type = JSON_STRING, .data = (void*)str }; +} + +jsonval_t json_createint(long num) { + return (jsonval_t){ .type = JSON_INT, .data = (void*)num }; +} + +jsonval_t json_createbool(bool val) { + return (jsonval_t){ .type = JSON_BOOL, .data = (void*)val }; +} + +jsonval_t json_createnull(void) { + return (jsonval_t){ .type = JSON_NULL }; +} + +void json_destroy(jsonval_t *val) { + if (val->type == JSON_OBJECT) { + jsonkv_dynarr_t *arr = (jsonkv_dynarr_t*)val->data; + for (size_t i = 0; i < arr->len; i++) + json_destroy(&arr->data[i].val); + dynarr_destroy(*arr); + xfree(arr); + } + else if (val->type == JSON_ARRAY) { + jsonval_dynarr_t *arr = (jsonval_dynarr_t*)val->data; + for (size_t i = 0; i < arr->len; i++) + json_destroy(&arr->data[i]); + dynarr_destroy(*arr); + xfree(arr); + } +} + +void json_write(FILE *out, jsonval_t *val) { + switch(val->type) { + case JSON_OBJECT: + fprintf(out, "{"); + jsonkv_dynarr_t *map = (jsonkv_dynarr_t*)val->data; + for (size_t i = 0; i < map->len; i++) { + jsonkv_t *pair = &map->data[i]; + fprintf(out, "\"%s\":", pair->key); + json_write(out, &pair->val); + if (i != map->len - 1) + fprintf(out, ","); + } + fprintf(out, "}"); + break; + case JSON_ARRAY: + fprintf(out, "["); + jsonval_dynarr_t *arr = (jsonval_dynarr_t*)val->data; + for (size_t i = 0; i < arr->len; i++) { + json_write(out, &arr->data[i]); + if (i != arr->len - 1) + fprintf(out, ","); + } + fprintf(out, "]"); + break; + case JSON_STRING: + fprintf(out, "\"%s\"", (const char*)val->data); + break; + case JSON_INT: + fprintf(out, "%ld", (long)val->data); + break; + case JSON_BOOL: + fprintf(out, "%s", val->data ? 
"true" : "false"); + break; + case JSON_NULL: + fprintf(out, "null"); + break; + } +} diff --git a/src/main.c b/src/main.c new file mode 100644 index 0000000..47dbf4a --- /dev/null +++ b/src/main.c @@ -0,0 +1,39 @@ +#include +#include +#include +#include + +#include "crawler.h" +#include "module.h" +#include "util.h" + +const char *allowed_hosts[] = { + "32bit.cafe", + //"en.wikipedia.org", + NULL +}; + +int main(int argc, char **argv) { + if (argc != 2) { + fprintf(stderr, "url dumbass\n"); + return 1; + } + + int seed, rc; + if ((rc = getrandom(&seed, sizeof(seed), 0)) != sizeof(seed)) { + fatal("getrandom() failed with %d", rc); + return 1; + } + srandom(seed); + + crawlerconfig_t config = { .allowedhosts = allowed_hosts, .req_interval_s = 0 }; + curl_global_init(CURL_GLOBAL_DEFAULT); + dynarr_init(moduleentryp_dynarr_t, config.enabledmodules); + for (moduleentry_t *module = availmodules; module->name != NULL; module++) + dynarr_push(config.enabledmodules, module); + crawler(argv[1], &config); + dynarr_destroy(config.enabledmodules); + curl_global_cleanup(); + + return 0; +} diff --git a/src/mod_debug.c b/src/mod_debug.c new file mode 100644 index 0000000..5c90180 --- /dev/null +++ b/src/mod_debug.c @@ -0,0 +1,56 @@ +#include +#include + +#include "util.h" +#include "module.h" + +/* Traverse the document tree */ +void dumpNode(TidyDoc doc, TidyNode tnod, int indent) +{ + TidyNode child; + for(child = tidyGetChild(tnod); child; child = tidyGetNext(child) ) { + ctmbstr name = tidyNodeGetName(child); + if(name) { + /* if it has a name, then it's an HTML tag ... 
*/ + TidyAttr attr; + fprintf(stderr, "%*.*s%s ", indent, indent, "<", name); + /* walk the attribute list */ + for(attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr) ) { + fprintf(stderr, "%s", tidyAttrName(attr)); + tidyAttrValue(attr)?fprintf(stderr, "=\"%s\" ", + tidyAttrValue(attr)):fprintf(stderr, " "); + } + fprintf(stderr, ">\n"); + } + else { + /* if it does not have a name, then it's probably text, cdata, etc... */ + TidyBuffer buf; + tidyBufInit(&buf); + tidyNodeGetText(doc, child, &buf); + fprintf(stderr, "%*.*s%s\n", indent, indent, "", buf.bp?(char *)buf.bp:""); + tidyBufFree(&buf); + } + dumpNode(doc, child, indent + 4); /* recursive */ + } +} + +int mod_debug_onpagecomplete(void *userdata, pagecompletedata_t *data) { + fprintf(stderr, "\n-- HTML for %s\n\n", data->url); + fwrite(data->page, 1, data->npage, stderr); + fprintf(stderr, "\n\n-- *CLEANED* HTML for %s\n\n", data->url); + TidyDoc doc = searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len); + if (doc == NULL) { + error("\"tidyDoc\" entry not found. 
either mod_tidy failed or is not loaded before"); + return -1; + } + dumpNode(doc, tidyGetRoot(doc), 0); + return 0; +} + +int mod_debug_init(crawlermodule_t *entry) { + *entry = (crawlermodule_t) { + .init = entry->init, + .onpagecomplete = mod_debug_onpagecomplete, + }; + return 0; +} diff --git a/src/mod_pagedata.c b/src/mod_pagedata.c new file mode 100644 index 0000000..b34022b --- /dev/null +++ b/src/mod_pagedata.c @@ -0,0 +1,149 @@ +#include +#include +#include + +#include "json.h" +#include "module.h" +#include "util.h" + +#define MAX_HEADER_SIZE 256 +#define MAX_TITLE_SIZE 256 + +bool getescapedheader(CURL *handle, const char *header, char **escaped) { + struct curl_header *data; + *escaped = NULL; + CURLHcode res = curl_easy_header(handle, header, 0, CURLH_HEADER, -1, &data); + if (res != CURLHE_OK) { + if (res == CURLHE_BADINDEX || res == CURLHE_NOREQUEST || + res == CURLHE_NOREQUEST || res == CURLHE_MISSING) + return true; + error("curl_easy_handle() failed with code %d", res); + return false; + } + int len = strlen(data->value); + if (len > MAX_HEADER_SIZE) { + error("max header size of %d bytes exceeded. 
header size is %d bytes", MAX_HEADER_SIZE, len); + return false; + } + *escaped = sanitize2ascii_dyn(data->value, MAX_HEADER_SIZE * 4); + if (escaped == NULL) { + error("sanitize2ascii_dyn() failed"); + return false; + } + return true; +} + +char *gettitle(TidyDoc doc, TidyNode node) { + for (TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) { + ctmbstr name = tidyNodeGetName(child); + if (!name) + return NULL; + if (strcmp(name, "title") == 0) { + TidyNode textchild = tidyGetChild(child); // If conforming, should be text + TidyBuffer buf; + tidyBufInit(&buf); + if (!tidyNodeGetText(doc, textchild, &buf)) { + tidyBufFree(&buf); + continue; + } + size_t len = strlen((char*)buf.bp); + // tidy places a newline at the end of a title, so we have to be careful to get rid + // of it + if (len <= 1) { + tidyBufFree(&buf); + continue; + } + char *ret = xmalloc(len+1); + strcpy(ret, (char*)buf.bp); + ret[len-1] = '\0'; + tidyBufFree(&buf); + return ret; + } + char *ret = gettitle(doc, child); + if (ret != NULL) + return ret; + } + return NULL; +} + +void dumpnode(TidyDoc doc, charp_dynarr_t *freearr, jsonkv_dynarr_t *json) { + TidyNode head = tidyGetHead(doc); + if (head) { + char *title = gettitle(doc, head); + if (title != NULL) { + int len = strlen(title); + if (len > MAX_TITLE_SIZE) { + error("max title size of %d bytes exceeded. 
title size is %d bytes", MAX_TITLE_SIZE, len); + } + else { + char *escaped = sanitize2ascii_dyn(title, MAX_TITLE_SIZE * 4); + if (escaped == NULL) { + error("sanitize2ascii_dyn() failed"); + xfree(escaped); + } + jsonkv_t kv = { .key = "title", .val = json_createstr(escaped) }; + dynarr_push(*json, kv); + dynarr_push(*freearr, escaped); + } + xfree(title); + } + } +} + +int mod_pagedata_onpagecomplete(void *userdata, pagecompletedata_t *data) { + jsonkv_dynarr_t *jsonarr = + searchextradata(EXTRA_JSON, "json", data->extradata.data, data->extradata.len); + if (jsonarr == NULL) { + jsonkv_dynarr_t *arr = xmalloc(sizeof(jsonkv_dynarr_t)); + dynarr_init(jsonkv_dynarr_t, *arr); + extradata_t entry = { + .type = EXTRA_JSON, + .key = "json", + .val = arr, + }; + dynarr_push(data->extradata, entry); + jsonarr = (jsonkv_dynarr_t*)dynarr_get(data->extradata, data->extradata.len-1)->val; + } + + charp_dynarr_t *freearr = userdata; + char *headernames[] = {"Content-Type", "Last-Modified", "ETag"}; + for (size_t i = 0; i < array_size(headernames); i++) { + char *escaped; + if (!getescapedheader(data->handle, headernames[i], &escaped)) { + error("appendheader() failed with header %s", headernames[i]); + continue; + } + if (escaped == NULL) + continue; + dynarr_push(*freearr, escaped); + jsonkv_t kv = { .key = headernames[i], .val = json_createstr(escaped) }; + dynarr_push(*jsonarr, kv); + } + + TidyDoc doc = searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len); + if (doc == NULL) + debug("url %s contained no tidy data, skipping document parsing...", data->url); + else + dumpnode(doc, freearr, jsonarr); + return 0; +} + +int mod_pagedata_destroy(void *userdata) { + charp_dynarr_t *freearr = userdata; + for (size_t i = 0; i < freearr->len; i++) + xfree(freearr->data[i]); + dynarr_destroy(*freearr); + xfree(freearr); + return 0; +} + +int mod_pagedata_init(crawlermodule_t *entry) { + *entry = (crawlermodule_t) { + .userdata = 
xmalloc(sizeof(charp_dynarr_t)), + .init = entry->init, + .destroy = mod_pagedata_destroy, + .onpagecomplete = mod_pagedata_onpagecomplete, + }; + *(charp_dynarr_t*)entry->userdata = dynarr_initi(charp_dynarr_t); + return 0; +} diff --git a/src/mod_parse.c b/src/mod_parse.c new file mode 100644 index 0000000..9db17c7 --- /dev/null +++ b/src/mod_parse.c @@ -0,0 +1,214 @@ +#include +#include +#include +#include + +#include "module.h" +#include "util.h" + +#define MAX_LINK_LEN 512 + +bool islinksafe(char c) { + return ('a' <= c && c <= 'z') || + ('A' <= c && c <= 'Z') || + ('0' <= c && c <= '9') || + c == '&' || c == '$' || c == ',' || c == '/' || + c == ':' || c == ';' || c == '=' || c == '?' || + c == '@' || c == '#' || c == '%' || c == '~' || + c == '_' || c == '-' || c == '(' || c == ')' || + c == '.'; +} + +char *relative2absolute(CURLU *curl_url_h, const char *parent, const char *relative) { + CURLUcode url_res; + char *curl_abs_link = NULL; + url_res = curl_url_set(curl_url_h, CURLUPART_URL, parent, 0); + if (url_res != CURLUE_OK) { + char sanitized[100]; + sanitize2ascii(sanitized, relative, sizeof(sanitized)); + error("Parent URL parsing failed for \"%s\": %s", + sanitized, curl_url_strerror(url_res)); + return NULL; + } + url_res = curl_url_set(curl_url_h, CURLUPART_URL, relative, 0); + if (url_res != CURLUE_OK){ + char sanitized[100]; + sanitize2ascii(sanitized, relative, sizeof(sanitized)); + error("Relative URL parsing failed for \"%s\": %s", + sanitized, curl_url_strerror(url_res)); + return NULL; + } + url_res = curl_url_get(curl_url_h, CURLUPART_URL, &curl_abs_link, CURLU_PUNYCODE); + if (url_res != CURLUE_OK) { + error("Full URL parsing failed: %s", curl_url_strerror(url_res)); + return NULL; + } + char *ret = xmalloc(strlen(curl_abs_link) + 1); + strcpy(ret, curl_abs_link); + curl_free(curl_abs_link); + return ret; +} + +size_t geturlcutlen(const char *url, size_t len) { + char *hash = memchr(url, '#', len); + size_t hlen = hash == NULL ? 
len : hash - url; + char *ques = memchr(url, '?', len); + size_t qlen = ques == NULL ? len : ques - url; + return min(hlen, qlen); +} + +char *parselink(CURLU *curl_url_h, const char *parent, const char *child, size_t nchild) { + size_t cutlen = geturlcutlen(child, nchild); + if (cutlen < 1) + return NULL; + char *linkbuf = xmalloc(cutlen + 1); + memcpy(linkbuf, child, cutlen); + linkbuf[cutlen] = '\0'; + char *abslink = relative2absolute(curl_url_h, parent, linkbuf); + if (abslink != NULL) { + char sanitized[100]; + sanitize2ascii(sanitized, abslink, sizeof(sanitized)); + debug("found link: %s", sanitized); + } + xfree(linkbuf); + return abslink; +} + +int parsehrefs(CURLU *curl_url_h, const char *url, const char *page, size_t npage, charp_dynarr_t *ret) { + const char *href = "href"; + size_t href_len = strlen(href), j = 0, linklen = 0; + int state = 0, linkcnt = 0; + // Probably a really good job for regex but wtvr + for (size_t i = 0; i < npage; i++) { + if (state == 0) { + if (j == href_len) { + state += 1; + i -= 1; + } + if (tolower(page[i]) != href[j]) + j = 0; + } + else if (state == 1 && page[i] == '=') { + state += 1; + } + else if (state == 2 && page[i] == '"') { + state += 1; + linklen = 0; + } + else if (state == 3) { + if (page[i] == '"') { + state = 0; + if (linklen > 0) { + char *link = parselink(curl_url_h, url, page + i - linklen, linklen); + if (link != NULL) { + dynarr_push(*ret, link); + linkcnt++; + } + } + } + if (!islinksafe(page[i]) || linklen >= MAX_LINK_LEN) + state = 0; + linklen += 1; + } + else if (page[i] != ' ' && page[i] != '\t' && page[i] != '\n') { + state = 0; + } + j++; + } + return linkcnt; +} + +char *tolower_s(const char *in) { + size_t len = strlen(in); + char *ret = xmalloc(len+1); + for (size_t i = 0; i < len; i++) + ret[i] = tolower(in[i]); + ret[len] = '\0'; + return ret; +} + +int parseattrs(CURLU *curl_url_h, const char *url, TidyNode node, charp_dynarr_t *links) { + ctmbstr name = tidyNodeGetName(node); + const char 
*texttags[] = { "p", "t", "span", "a" }; + char *namelower = tolower_s(name); + bool found = false; + for (size_t i = 0; !found && i < array_size(texttags); i++) + if (strcmp(namelower, texttags[i]) == 0) + found = true; + xfree(namelower); + if (!found) + return 0; + // Parse attributes for links + int linkcnt = 0; + for (TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) { + ctmbstr attrname = tidyAttrName(attr); + if (attrname == NULL) { + char sanitized[100]; + sanitize2ascii(sanitized, url, sizeof(sanitized)); + error("empty attr name for url \"%s\"", sanitized); + continue; + } + char *attrnamelower = tolower_s(attrname); + ctmbstr val = tidyAttrValue(attr); + if (strcmp(attrnamelower, "href") != 0 || val == NULL) { + xfree(attrnamelower); + continue; + } + xfree(attrnamelower); + // extract the link + char *link = parselink(curl_url_h, url, val, strlen(val)); + if (link == NULL) + continue; + dynarr_push(*links, link); + linkcnt++; + } + return linkcnt; +} + +int parsenode(CURLU *curl_url_h, const char *url, TidyNode node, charp_dynarr_t *links) { + int linkcnt = 0; + for (TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) { + ctmbstr name = tidyNodeGetName(child); + if (name == NULL) { + // Node is probably text, for now do nothing + // TODO: Parse plain text links + } + else { + linkcnt += parseattrs(curl_url_h, url, child, links); + } + linkcnt += parsenode(curl_url_h, url, child, links); + } + return linkcnt; +} + +int mod_parse_onpagecomplete(void *userdata, pagecompletedata_t *data) { + CURLU *curl_url_h = userdata; + TidyDoc doc = searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len); + if (doc == NULL) { + debug("No tidied document found. 
Parsing hrefs..."); + parsehrefs(curl_url_h, data->url, data->page, data->npage, data->parsedlinks); + } + else { + TidyNode body = tidyGetBody(doc); + if (!body) + return 0; + parsenode(curl_url_h, data->url, body, data->parsedlinks); + } + return 0; +} + +int mod_parse_destroy(void *userdata) { + CURLU *curl_url_h = userdata; + curl_url_cleanup(curl_url_h); + return 0; +} + +int mod_parse_init(crawlermodule_t *entry) { + *entry = (crawlermodule_t) { + .init = entry->init, + .onpagecomplete = mod_parse_onpagecomplete, + .destroy = mod_parse_destroy, + .userdata = curl_url(), + }; + return 0; +} diff --git a/src/mod_robots.c b/src/mod_robots.c new file mode 100644 index 0000000..d105ad8 --- /dev/null +++ b/src/mod_robots.c @@ -0,0 +1,235 @@ +#include +#include +#include +#include +#include + +#include "module.h" +#include "util.h" + +typedef struct hostdata { + char *host; + time_t crawldelay_ms; + charp_dynarr_t prefixes; + struct hostdata *next; + struct timespec lastcrawled; +} hostdata_t; + +typedef struct { + hostdata_t *rules; + CURLU *curl_url_h; +} state_t; + +bool isprefixed(const char *str, const char *prefix) { + size_t prefixlen = strlen(prefix); + return prefixlen > strlen(str) ? 
false : memcmp(str, prefix, prefixlen) == 0;
+}
+
+// Returns an xmalloc'd copy of the NUL-terminated string `str`; release it with xfree().
+static char *robots_strdup(const char *str) {
+    char *copy = xmalloc(strlen(str) + 1);
+    strcpy(copy, str);
+    return copy;
+}
+
+// Upper bound (seconds) accepted for Crawl-delay so that the conversion to
+// milliseconds below cannot overflow time_t.
+#define MAX_CRAWL_DELAY_S 86400.0
+
+// Parses a robots.txt body in place and stores the directives that apply to
+// this crawler in `rules`.
+//   rules: host entry to fill; `prefixes` must already be initialised. Every
+//          prefix pushed here is heap-allocated and owned by `rules`.
+//   page:  NUL-terminated robots.txt text. It is modified: tabs and carriage
+//          returns become spaces and strtok_r() writes NUL bytes into it.
+// Recognised directives are "User-agent:", "Disallow:" and "Crawl-delay:";
+// other lines are ignored. A group applies when its agent is "*" or starts
+// with "AvaBot". Directives seen before any User-agent line are applied too.
+void parse_robots_txt(hostdata_t *rules, char *page) {
+    // '\r' is folded as well so CRLF files do not leave a stray byte in the argument.
+    for (char *ptr = page; *ptr != '\0'; ptr++)
+        if (*ptr == '\t' || *ptr == '\r')
+            *ptr = ' ';
+    char *linesave, *line;
+    bool forus = true;
+    // page is guaranteed to be terminated by a null byte
+    while ((line = strtok_r(page, "\n", &linesave)) != NULL) {
+        page = NULL;
+        // Everything after '#' is a comment.
+        char *comment = strchr(line, '#');
+        if (comment != NULL)
+            *comment = '\0';
+        // Both tokens point into `line`, so nothing has to be copied or freed.
+        char *wssave;
+        char *func = strtok_r(line, " ", &wssave);
+        if (func == NULL)
+            continue;
+        char *arg = strtok_r(NULL, " ", &wssave);
+        if (arg == NULL)
+            continue;
+
+        if (strcmp(func, "User-agent:") == 0) {
+            forus = strcmp(arg, "*") == 0 || isprefixed(arg, "AvaBot");
+        }
+        else if (!forus) {
+            // Directive belongs to a group addressed to another crawler.
+            continue;
+        }
+        else if (strcmp(func, "Disallow:") == 0) {
+            // "*" is treated as "everything". The prefix is always duplicated so
+            // that mod_robots_destroy() can free all entries uniformly.
+            char *buf = robots_strdup(strcmp(arg, "*") == 0 ? "/" : arg);
+            dynarr_push(rules->prefixes, buf);
+        }
+        else if (strcmp(func, "Crawl-delay:") == 0) {
+            // The directive is in (possibly fractional) seconds; the field holds
+            // milliseconds. NaN and negative values are ignored.
+            double delay_s = atof(arg);
+            if (delay_s > MAX_CRAWL_DELAY_S)
+                delay_s = MAX_CRAWL_DELAY_S;
+            if (delay_s >= 0)
+                rules->crawldelay_ms = (time_t)(delay_s * 1000.0);
+        }
+    }
+}
+
+// Request callback for a host's robots.txt (queued through makerequest()).
+//   cbdata: heap-allocated state_t owned by this callback; `rules` points at the
+//           host entry to fill in. It is freed here unless the request is
+//           retried, in which case it becomes the cbdata of the retry.
+//   url:    URL that was fetched.
+//   page:   NUL-terminated response body, or NULL.
+//   npage:  unused.
+//   handle: easy handle of the transfer; queried for the content type.
+// When "/robots.txt" did not produce a text/plain body, "/.well-known/robots.txt"
+// is requested as a fallback and the host entry stays pending. In every other
+// case the entry is marked finished (prefixes initialised, default delay of
+// 100 ms) and a usable body is parsed.
+void robots_txt_cb(void *cbdata, const char *url, char *page, size_t npage, CURL *handle) {
+    (void)npage;
+    state_t *state = cbdata;
+    hostdata_t *rules = state->rules;
+    char *curl_path = NULL, *ctype = NULL;
+    bool usable = false;
+    CURLUcode rc;
+    CURLcode ec;
+    if ((rc = curl_url_set(state->curl_url_h, CURLUPART_URL, url, 0)) != CURLUE_OK ||
+            (rc = curl_url_get(state->curl_url_h, CURLUPART_PATH, &curl_path, 0)) != CURLUE_OK) {
+        error("curl_url operation failed: %s (code %d)", curl_url_strerror(rc), rc);
+        goto finish;
+    }
+    if ((ec = curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype)) != CURLE_OK) {
+        error("curl_easy_getinfo failed: %s (code %d)", curl_easy_strerror(ec), ec);
+        goto finish;
+    }
+    usable = page != NULL && ctype != NULL && isprefixed(ctype, "text/plain");
+    // curl reports paths with a leading '/', so compare against "/robots.txt".
+    if (!usable && strcmp(curl_path, "/robots.txt") == 0) {
+        char *curl_newurl = NULL;
+        const char *newpath = ".well-known/robots.txt";
+        if ((rc = curl_url_set(state->curl_url_h, CURLUPART_PATH, newpath, 0)) != CURLUE_OK ||
+                (rc = curl_url_get(state->curl_url_h, CURLUPART_URL, &curl_newurl, 0)) != CURLUE_OK) {
+            error("curl_url operation failed: %s (code %d)", curl_url_strerror(rc), rc);
+            goto finish;
+        }
+        char *newurl = robots_strdup(curl_newurl);
+        curl_free(curl_newurl);
+        curl_free(curl_path);
+        // state is handed over to the retried request, so it must not be freed here.
+        makerequest(newurl, robots_txt_cb, cbdata);
+        return;
+    }
+finish:
+    // A non-NULL prefixes.data is what tells mod_robots_filter() that the
+    // request has finished, so it is only set on terminal paths.
+    if (rules->prefixes.data == NULL) {
+        dynarr_init(charp_dynarr_t, rules->prefixes);
+    }
+    rules->crawldelay_ms = 100;
+    if (usable)
+        parse_robots_txt(rules, page);
+    curl_free(curl_path);
+    xfree(state); // state is cb owned
+}
+
+// Filter hook: decides whether `url` may be crawled right now.
+//   userdata: module state_t created by mod_robots_init().
+//   url:      URL to check.
+// Returns FILTER_PASS when the URL is allowed or cannot be analysed,
+// FILTER_REJECT when one of the host's Disallow prefixes matches its path, and
+// FILTER_STALL while the host's robots.txt is pending, the crawl delay has not
+// elapsed, or the clock cannot be read. The first URL seen for a host creates
+// the host entry and queues the robots.txt request.
+filterres_t mod_robots_filter(void *userdata, const char *url) {
+    state_t *state = userdata;
+    char *curl_host = NULL, *curl_path = NULL;
+    hostdata_t *rules, *prev = NULL;
+    filterres_t ret = FILTER_PASS;
+    CURLUcode rc;
+    // The comparison must stay outside the assignment's parentheses, otherwise rc
+    // holds 0/1 instead of the CURLUcode that is reported below.
+    if ((rc = curl_url_set(state->curl_url_h, CURLUPART_URL, url, 0)) != CURLUE_OK ||
+            (rc = curl_url_get(state->curl_url_h, CURLUPART_HOST, &curl_host, 0)) != CURLUE_OK ||
+            (rc = curl_url_get(state->curl_url_h, CURLUPART_PATH, &curl_path, 0)) != CURLUE_OK) {
+        char sanitized[100];
+        sanitize2ascii(sanitized, url, sizeof(sanitized));
+        error("curl_url operation failed for \"%s\": %s (code %d)", sanitized,
+                curl_url_strerror(rc), rc);
+        goto cleanup;
+    }
+    if (curl_host == NULL) {
+        error("curl_host == NULL");
+        goto cleanup;
+    }
+    // Find the entry for this host; prev ends up at the list tail when there is none.
+    for (rules = state->rules;
+            rules != NULL && strcmp(rules->host, curl_host) != 0;
+            prev = rules, rules = rules->next)
+        ;
+    if (rules == NULL) {
+        // Unknown host: build its robots.txt URL. Query and fragment of the
+        // crawled URL are dropped so they do not end up in the request.
+        char *robots_url = NULL;
+        if ((rc = curl_url_set(state->curl_url_h, CURLUPART_QUERY, NULL, 0)) != CURLUE_OK ||
+                (rc = curl_url_set(state->curl_url_h, CURLUPART_FRAGMENT, NULL, 0)) != CURLUE_OK ||
+                (rc = curl_url_set(state->curl_url_h, CURLUPART_PATH, "robots.txt", 0)) != CURLUE_OK ||
+                (rc = curl_url_get(state->curl_url_h, CURLUPART_URL, &robots_url, 0)) != CURLUE_OK) {
+            char sanitized[100];
+            sanitize2ascii(sanitized, url, sizeof(sanitized));
+            error("curl_url operation failed for \"%s\": %s (code %d)", sanitized,
+                    curl_url_strerror(rc), rc);
+            goto cleanup;
+        }
+        if (robots_url == NULL) {
+            error("curl_url == NULL");
+            goto cleanup;
+        }
+        // Zero-initialised entry: prefixes.data == NULL marks it as pending.
+        hostdata_t *newrules = xmalloc(sizeof(hostdata_t));
+        *newrules = (hostdata_t) { .host = robots_strdup(curl_host) };
+        if (prev == NULL)
+            state->rules = newrules;
+        else
+            prev->next = newrules;
+        // The callback gets its own state_t, which it owns and frees.
+        state_t *cbdata = xmalloc(sizeof(state_t));
+        cbdata->rules = newrules;
+        cbdata->curl_url_h = state->curl_url_h;
+        debug("cbdata = %p", (void *)cbdata);
+        makerequest(robots_strdup(robots_url), robots_txt_cb, cbdata);
+        curl_free(robots_url);
+        ret = FILTER_STALL;
+    }
+    else {
+        if (rules->prefixes.data == NULL) {
+            // robots.txt request hasn't finished
+            ret = FILTER_STALL;
+            goto cleanup;
+        }
+        struct timespec now;
+        if (clock_gettime(CLOCK_MONOTONIC, &now) < 0) {
+            char *err = strerror(errno);
+            error("clock_gettime failed: %s (code %d)", err, errno);
+            ret = FILTER_STALL;
+            goto cleanup;
+        }
+        // Milliseconds since this host was last allowed through.
+        time_t diff_ms = now.tv_sec * 1000 + now.tv_nsec / 1000000 -
+            rules->lastcrawled.tv_sec * 1000 - rules->lastcrawled.tv_nsec / 1000000;
+        if (diff_ms < rules->crawldelay_ms) {
+            ret = FILTER_STALL;
+            goto cleanup;
+        }
+        rules->lastcrawled = now;
+        ret = FILTER_PASS;
+        for (size_t i = 0; i < rules->prefixes.len; i++) {
+            if (isprefixed(curl_path, rules->prefixes.data[i])) {
+                ret = FILTER_REJECT;
+                break;
+            }
+        }
+    }
+cleanup:
+    // Both pointers start out NULL and curl_free(NULL) is a no-op.
+    curl_free(curl_path);
+    curl_free(curl_host);
+    return ret;
+}
+
+// Destroy hook: releases the URL handle, every host entry (host name, prefix
+// strings, prefix array) and the module state itself. Always returns 0.
+int mod_robots_destroy(void *userdata) {
+    state_t *state = userdata;
+    curl_url_cleanup(state->curl_url_h);
+    hostdata_t *cur = state->rules, *next;
+    for (; cur != NULL; cur = next) {
+        next = cur->next;
+        // Prefix strings are heap copies made by parse_robots_txt().
+        for (size_t i = 0; i < cur->prefixes.len; i++)
+            xfree(cur->prefixes.data[i]);
+        dynarr_destroy(cur->prefixes);
+        xfree(cur->host);
+        xfree(cur);
+    }
+    xfree(state);
+    return 0;
+}
+
+int mod_robots_init(crawlermodule_t *entry) {
+    state_t *state = xmalloc(sizeof(state_t));
+    *state = (state_t) {
+        .rules
= NULL,
+        .curl_url_h = curl_url(),
+    };
+    *entry = (crawlermodule_t) {
+        .userdata = state,
+        .init = entry->init,
+        .filter = mod_robots_filter,
+        .destroy = mod_robots_destroy,
+    };
+    return 0;
+}
diff --git a/src/mod_tidy.c b/src/mod_tidy.c
new file mode 100644
index 0000000..8d828b6
--- /dev/null
+++ b/src/mod_tidy.c
@@ -0,0 +1,71 @@
+#include
+#include
+#include "util.h"
+#include "module.h"
+
+// Runs tidy's parse, clean-and-repair and diagnostics passes on `docbuf`.
+// Stops at the first pass that returns a negative code (after logging it) and
+// returns that code; otherwise returns the result of the diagnostics pass.
+int repairdoc(TidyDoc doc, TidyBuffer *docbuf) {
+    int rc;
+    rc = tidyParseBuffer(doc, docbuf);
+    if (rc < 0) {
+        error("tidyParseBuffer() returned code %d", rc);
+        return rc;
+    }
+    rc = tidyCleanAndRepair(doc);
+    if (rc < 0) {
+        error("tidyCleanAndRepair() returned code %d", rc);
+        return rc;
+    }
+    rc = tidyRunDiagnostics(doc);
+    if (rc < 0)
+        error("tidyRunDiagnostics() returned code %d", rc);
+    return rc;
+}
+
+// Page-complete hook: tidies data->page and publishes the resulting TidyDoc in
+// data->extradata under (EXTRA_TIDY, "tidyDoc").
+// Returns 0 on success, -1 when a tidyDoc entry already exists or the document
+// cannot be created, and the negative tidy code when repairing fails.
+int mod_tidy_onpagecomplete(void *userdata, pagecompletedata_t *data) {
+    if (searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len) != NULL)
+        return -1;
+
+    TidyDoc tdoc = tidyCreate();
+    if (tdoc == NULL) {
+        error("tidyCreate() failed");
+        return -1;
+    }
+
+    // origbuf only borrows data->page; it is detached again below, never freed.
+    TidyBuffer origbuf = {0}, errbuf = {0};
+    tidyBufAttach(&origbuf, (byte*)data->page, data->npage);
+
+    tidyOptSetBool(tdoc, TidyForceOutput, yes);
+    tidyOptSetInt(tdoc, TidyWrapLen, 4096);
+    tidySetErrorBuffer(tdoc, &errbuf);
+
+    int rc = repairdoc(tdoc, &origbuf);
+    if (rc < 0)
+        goto cleanup;
+
+    // errbuf starts zeroed, so bp is still NULL if tidy never wrote to it;
+    // NULL must not be passed to %s.
+    if (errbuf.bp != NULL) {
+        debug("encountered errors in doc: %s", errbuf.bp);
+    }
+
+    extradata_t entry = {
+        .type = EXTRA_TIDY,
+        .key = "tidyDoc",
+        .val = (void*)tdoc,
+    };
+    dynarr_push(data->extradata, entry);
+cleanup:
+    tidyBufFree(&errbuf);
+    // TODO: Safety is unknown
+    // tdoc still exists while corresponding tidyBuffer is detached
+    tidyBufDetach(&origbuf);
+    // On failure the document was never published in extradata, so
+    // mod_tidy_onpagedestroy() cannot find it; release it here instead of leaking it.
+    if (rc < 0)
+        tidyRelease(tdoc);
+    return rc >= 0 ? 0 : rc;
+}
+
+// Page-destroy hook: releases the TidyDoc published by mod_tidy_onpagecomplete(),
+// if there is one. Always returns 0.
+int mod_tidy_onpagedestroy(void *userdata, pagecompletedata_t *data) {
+    TidyDoc doc = searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len);
+    if (doc != NULL)
+        tidyRelease(doc);
+    return 0;
+}
+
+// Fills in the module entry for mod_tidy, keeping the existing init pointer.
+int mod_tidy_init(crawlermodule_t *entry) {
+    *entry = (crawlermodule_t) {
+        .init = entry->init,
+        .onpagecomplete = mod_tidy_onpagecomplete,
+        .onpagedestroy = mod_tidy_onpagedestroy,
+    };
+    return 0;
+}
diff --git a/src/module.c b/src/module.c
new file mode 100644
index 0000000..d3ba42c
--- /dev/null
+++ b/src/module.c
@@ -0,0 +1,28 @@
+#include
+
+#include "module.h"
+
+moduleentry_t availmodules[] = {
+    (moduleentry_t) { .name = "mod_tidy", .module = { .init = mod_tidy_init } },
+    //(moduleentry_t) { .name = "mod_debug", .module = { .init = mod_debug_init } },
+    (moduleentry_t) { .name = "mod_pagedata", .module = { .init = mod_pagedata_init } },
+    (moduleentry_t) { .name = "mod_parse", .module = { .init = mod_parse_init } },
+    (moduleentry_t) { .name = "mod_robots", .module = { .init = mod_robots_init } },
+    { 0 },
+};
+
+requestedreq_dyanrr_t requestedreqs = { 0 };
+
+// Returns the value of the first entry in data[0..ndata) whose type and key
+// both match, or NULL when there is none.
+void *searchextradata(extradata_type_t type, char *key, extradata_t *data, size_t ndata) {
+    for (size_t i = 0; i < ndata; i++)
+        if (data[i].type == type && strcmp(data[i].key, key) == 0)
+            return data[i].val;
+    return NULL;
+}
+
+// Appends a request for `url` to the global requestedreqs array, initialising the
+// array on first use. The url pointer is stored as-is, not copied.
+void makerequest(const char *url, reqcb_t cb, void *userdata) {
+    if (requestedreqs.data == NULL)
+        dynarr_init(requestedreq_dyanrr_t, requestedreqs);
+    requestedreq_t request = { .url = url, .cb = cb, .userdata = userdata, 0 };
+    dynarr_push(requestedreqs, request);
+}
diff --git a/src/util.c b/src/util.c
new file mode 100644
index 0000000..55e3d88
--- /dev/null
+++ b/src/util.c
@@ -0,0 +1,228 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "util.h"
+
+const char *last2path(const char *path) {
+    const char *prev2 = NULL, *prev = NULL, *cur = path;
+    for(;;) {
+        prev2 = prev;
+        prev = cur;
+        cur = memchr(cur, '/', strlen(cur));
+        // Test before advancing: incrementing a null pointer is undefined behaviour.
+        if (cur == NULL)
+            break;
+        cur++;
+    }
+    if (prev2 == NULL)
+        return path;
+    return prev2;
+}
+
+// Writes one log line to stderr: a colour/level tag, "(file:line)" shortened with
+// last2path() (or "(unknown file)" when file is NULL), the formatted message,
+// then a colour reset and newline.
+void volog(loglevel_t level, const char *file, int line, const char *fmt, va_list ap) {
+    const char *cc;
+    switch (level) {
+    case LEVEL_DEBUG:
+        cc = "\x1b[0mDEBUG";
+        break;
+    case LEVEL_INFO:
+        cc = "\x1b[0mINFO";
+        break;
+    case LEVEL_WARN:
+        cc = "\x1b[1;35mWARN";
+        break;
+    case LEVEL_ERROR:
+        cc = "\x1b[1;31mERROR";
+        break;
+    case LEVEL_FATAL:
+        cc = "\x1b[38;5;124mFATAL";
+        break;
+    default:
+        // Unknown level: print a defined tag instead of reading an uninitialised pointer.
+        cc = "\x1b[0mLOG";
+        break;
+    }
+    if (file == NULL)
+        fprintf(stderr, "%s (unknown file): ", cc);
+    else
+        fprintf(stderr, "%s (%s:%d): ", cc, last2path(file), line);
+    vfprintf(stderr, fmt, ap);
+    fprintf(stderr, "\x1b[0m\n");
+}
+
+// Variadic front end for volog().
+void olog(loglevel_t level, const char *file, int line, const char *fmt, ...) {
+    va_list ap;
+    va_start(ap, fmt);
+    volog(level, file, line, fmt, ap);
+    va_end(ap);
+}
+
+// Logs the formatted message at LEVEL_FATAL (without file information) and exits with status 1.
+void die(const char *fmt, ...) {
+    va_list ap;
+    va_start(ap, fmt);
+    volog(LEVEL_FATAL, NULL, 0, fmt, ap);
+    va_end(ap);
+    exit(1);
+}
+
+// malloc() that calls die() on failure. A zero-size request may return NULL,
+// which is not treated as an error.
+void *xmalloc(size_t size) {
+    void *ret = malloc(size);
+    if (ret == NULL && size != 0)
+        die("xmalloc failed: %s", strerror(errno));
+    return ret;
+}
+
+// calloc() that calls die() on failure. A zero-size request may return NULL,
+// which is not treated as an error.
+void *xcalloc(size_t nmemb, size_t size) {
+    void *ret = calloc(nmemb, size);
+    if (ret == NULL && nmemb != 0 && size != 0)
+        die("xcalloc failed: %s", strerror(errno));
+    return ret;
+}
+
+// realloc() that calls die() on failure. A zero-size request may return NULL,
+// which is not treated as an error.
+void *xrealloc(void *ptr, size_t size) {
+    void *ret = realloc(ptr, size);
+    if (ret == NULL && size != 0)
+        die("xrealloc failed: %s", strerror(errno));
+    return ret;
+}
+
+// Counterpart of the x*alloc functions.
+void xfree(void *ptr) {
+    free(ptr);
+}
+
+// Returns the sum of the `ndata` bytes at `data`, each taken as unsigned char.
+size_t parityhash(const void *data, size_t ndata) {
+    const unsigned char *bytes = data;
+    size_t sum = 0;
+    for (size_t i = 0; i < ndata; i++)
+        sum += bytes[i];
+    return sum;
+}
+
+// Number of output bytes put_sanitized() writes for byte `c`.
+static size_t sanitized_len(unsigned char c) {
+    if (c < 0x20 || c > 0x7e)
+        return 4; // "\xXX"
+    if (c == '\\' || c == '"')
+        return 2; // backslash + character
+    return 1;
+}
+
+// Writes the sanitized form of `c` to `dst` (no terminator) and returns its length.
+// Taking the byte as unsigned char matters: where plain char is signed, bytes
+// >= 0x80 are negative and "%02x" on the promoted int used to yield "\xff" for all of them.
+static size_t put_sanitized(char *dst, unsigned char c) {
+    static const char hex[] = "0123456789abcdef";
+    if (c < 0x20 || c > 0x7e) {
+        dst[0] = '\\';
+        dst[1] = 'x';
+        dst[2] = hex[c >> 4];
+        dst[3] = hex[c & 0x0f];
+        return 4;
+    }
+    if (c == '\\' || c == '"') {
+        dst[0] = '\\';
+        dst[1] = (char)c;
+        return 2;
+    }
+    dst[0] = (char)c;
+    return 1;
+}
+
+// Heap-allocating variant of sanitize2ascii().
+//   inp:    NUL-terminated input
+//   maxlen: limit for the escaped output
+// Returns an xmalloc'd, NUL-terminated string (release with xfree()), or NULL
+// when the input is longer than maxlen or its escaped form does not fit.
+char *sanitize2ascii_dyn(const char *inp, size_t maxlen) {
+    size_t nout = strlen(inp), out_ind = 0;
+    if (nout > maxlen)
+        return NULL;
+    // +1 keeps room for the terminator even when inp is empty.
+    nout += 1;
+    char *out = xmalloc(nout);
+    size_t inp_ind;
+    for (inp_ind = 0; inp[inp_ind] != '\0'; inp_ind++) {
+        unsigned char c = (unsigned char)inp[inp_ind];
+        size_t need = sanitized_len(c);
+        if (out_ind + need + 1 >= maxlen)
+            break;
+        if (out_ind + need + 1 >= nout) {
+            // A single doubling is not always enough for short inputs.
+            while (out_ind + need + 1 >= nout)
+                nout *= 2;
+            out = xrealloc(out, nout);
+        }
+        out_ind += put_sanitized(out + out_ind, c);
+    }
+    // Leaving the loop early means the output limit was hit.
+    if (inp[inp_ind] != '\0') {
+        xfree(out);
+        return NULL;
+    }
+    out[out_ind] = '\0';
+    return out;
+}
+
+// Writes a printable-ASCII rendering of `inp` into `out`.
+//   out:     destination buffer
+//   inp:     NUL-terminated input
+//   outsize: size of `out` in bytes; nothing is written when it is 0
+// Bytes outside 0x20..0x7e become "\xXX" (lowercase hex), '\\' and '"' get a
+// leading backslash, everything else is copied. Output that does not fit is
+// cut off at a character boundary. Returns the number of bytes written, not
+// counting the terminating NUL.
+size_t sanitize2ascii(char *out, const char *inp, size_t outsize) {
+    size_t out_ind = 0;
+    if (outsize == 0)
+        return 0;
+    for (size_t inp_ind = 0; inp[inp_ind] != '\0'; inp_ind++) {
+        unsigned char c = (unsigned char)inp[inp_ind];
+        size_t need = sanitized_len(c);
+        if (out_ind + need + 1 >= outsize)
+            break;
+        out_ind += put_sanitized(out + out_ind, c);
+    }
+    out[out_ind] = '\0';
+    return out_ind;
+}
+
+// This is the sample c implementation of MurmurHash taken from Wikipedia. Credit to the Wikipedia article
+// of MurmurHash and whoever made the sample implementation on the page.
+// https://en.wikipedia.org/w/index.php?title=MurmurHash&oldid=1218923262 accessed on 2024-06-02T18+00:00 +// ----- BEGIN WIKIPEDIA SAMPLE CODE ----- +static inline uint32_t murmur_32_scramble(uint32_t k) { + k *= 0xcc9e2d51; + k = (k << 15) | (k >> 17); + k *= 0x1b873593; + return k; +} +uint32_t murmur3_32(const uint8_t* key, size_t len, uint32_t seed) +{ + uint32_t h = seed; + uint32_t k; + /* Read in groups of 4. */ + for (size_t i = len >> 2; i; i--) { + // Here is a source of differing results across endiannesses. + // A swap here has no effects on hash properties though. + memcpy(&k, key, sizeof(uint32_t)); + key += sizeof(uint32_t); + h ^= murmur_32_scramble(k); + h = (h << 13) | (h >> 19); + h = h * 5 + 0xe6546b64; + } + /* Read the rest. */ + k = 0; + for (size_t i = len & 3; i; i--) { + k <<= 8; + k |= key[i - 1]; + } + // A swap is *not* necessary here because the preceding loop already + // places the low bytes in the low places according to whatever endianness + // we use. Swaps only apply when the memory is copied in a chunk. + h ^= murmur_32_scramble(k); + /* Finalize. */ + h ^= len; + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; +} +// ----- END WIKIPEDIA SAMPLE CODE ----- + +bool hset_charp_cmp(char **lhs, char **rhs) { + return strcmp(*lhs, *rhs) == 0; +} + +size_t hset_charp_hash(char **str) { + // u32 -> u64? 
+ return (size_t)murmur3_32(*str, strlen(*str), 0x9747b28c); +} diff --git a/tests/deque_pop.c b/tests/deque_pop.c new file mode 100644 index 0000000..6a0d2be --- /dev/null +++ b/tests/deque_pop.c @@ -0,0 +1,45 @@ +#include + +#include "util.h" +#include "unit.h" + +#define TEST_LEN 32768 +#define SIZE_MAX (size_t)-1 + +int tests_deque_pop(int argc, char **argv) { + int *expected_front = xmalloc(TEST_LEN / 2 * sizeof(int)); + int *expected_back = xmalloc(TEST_LEN / 2 * sizeof(int)); + for (size_t i = 0; i < TEST_LEN / 2; i++) + expected_front[i] = rand(); + for (size_t i = 0; i < TEST_LEN / 2; i++) + expected_back[i] = rand(); + int_deque_t deq; + deque_init(int_deque_t, deq); + for (size_t i = 0; i < TEST_LEN / 2; i++) { + if (rand() % 2 == 0) { + deque_push_back(deq, expected_back[i]); + deque_push_front(deq, expected_front[TEST_LEN / 2 - i - 1]); + } + else { + deque_push_front(deq, expected_front[TEST_LEN / 2 - i - 1]); + deque_push_back(deq, expected_back[i]); + } + } + int_deque_t from_back, from_front; + deque_clone(from_back, deq); + deque_clone(from_front, deq); + deque_destroy(deq); + for (size_t i = 0; i < 2 * (TEST_LEN / 2); i++) { + int exp = (i >= TEST_LEN / 2) ? expected_back[i - TEST_LEN / 2] : expected_front[i]; + chi_assert("from_front value doesnt match", deque_pop_front(from_front) == exp); + } + for (size_t i = 2 * (TEST_LEN / 2) - 1; i != SIZE_MAX; i--) { + int exp = (i >= TEST_LEN / 2) ? 
expected_back[i - TEST_LEN / 2] : expected_front[i]; + chi_assert("from_back value doesnt match", deque_pop_back(from_back) == exp); + } + deque_destroy(from_front); + deque_destroy(from_back); + xfree(expected_back); + xfree(expected_front); + return 0; +} diff --git a/tests/deque_push.c b/tests/deque_push.c new file mode 100644 index 0000000..70f66ba --- /dev/null +++ b/tests/deque_push.c @@ -0,0 +1,58 @@ +#include "util.h" +#include "unit.h" + +#define SIZE_MAX (size_t)-1 + +int tests_deque_push(int argc, char **argv) { + int_deque_t deq; + + // push back and front + deque_init(int_deque_t, deq); + deque_push_back(deq, 10); + chi_assert("incorrect back value", deq.base[0] == 10); + deque_push_front(deq, 69); + chi_assert("incorrect back value", deq.base[0] == 10); + chi_assert("incorrect front value", deq.base[deq.cap-1] == 69); + for(size_t i = 0; i < DEQUE_INIT_CAP - 2; i++) + deque_push_front(deq, i); + chi_assert("length and/or capacity incorrect", deq.len == deq.cap && deq.cap == DEQUE_INIT_CAP); + chi_assert("incorrect back value", deq.base[0] == 10); + chi_assert("incorrect front value ", deq.base[deq.cap-1] == 69); + chi_assert("incorrect back", deq.back == 1); + chi_assert("incorrect front", deq.front == 1); + for (size_t i = 2; i < deq.cap - 1; i++) + chi_assert("incorrect ordering", deq.base[i-1] > deq.base[i]); + deque_push_back(deq, 0xee); + chi_assert("length and/or capacity incorrect correct", + deq.len == DEQUE_INIT_CAP + 1 && deq.cap == DEQUE_INIT_CAP * 2); + chi_assert("incorrect back value 1", deq.base[deq.len - 1] == 0xee); + chi_assert("incorrect back value 2", deq.base[deq.len - 2] == 10); + chi_assert("incorrect front value", deq.base[deq.len - 3] == 69); + chi_assert("incorrect back", deq.back == DEQUE_INIT_CAP + 1); + chi_assert("incorrect front", deq.front == 0); + for (size_t i = 1; i < deq.len - 3; i++) + chi_assert("incorrect ordering", deq.base[i-1] > deq.base[i]); + deque_destroy(deq); + + // push front + deque_init(int_deque_t, 
deq); + for (size_t i = 4 * DEQUE_INIT_CAP - 1; i != SIZE_MAX; i--) + deque_push_front(deq, i); + chi_assert("incorrect back and/or front", deq.back == deq.front && deq.back == DEQUE_INIT_CAP * 2); + chi_assert("incorrect length and/or capcity", deq.len == deq.cap && deq.cap == 4 * DEQUE_INIT_CAP); + for (size_t i = 0; i < deq.cap; i++) + chi_assert("incorrect value", *deque_get(deq, i) == i); + deque_destroy(deq); + + // push back + deque_init(int_deque_t, deq); + for (size_t i = 0; i < 4 * DEQUE_INIT_CAP; i++) + deque_push_back(deq, i); + chi_assert("incorrect back and/or front", deq.back == deq.front && deq.back == 0); + chi_assert("incorrect length and/or capcity", deq.len == deq.cap && deq.cap == 4 * DEQUE_INIT_CAP); + for (size_t i = 0; i < deq.cap; i++) + chi_assert("incorrect value", *deque_get(deq, i) == i); + deque_destroy(deq); + + return 0; +} diff --git a/tests/dynarr.c.old b/tests/dynarr.c.old new file mode 100644 index 0000000..ed0018e --- /dev/null +++ b/tests/dynarr.c.old @@ -0,0 +1,133 @@ +#include + +#include "util.h" +#include "unit.h" + +#define TEST_LEN 32768 + +void get_test(void) { + int_dynarr_t arr = DYNARR_INIT(int_dynarr_t); + for (int i = 0; i < 100; i++) + DYNARR_PUSH(arr, 0); + chi_assert("get(5) == arr.data + 5", DYNARR_GET(arr, 5) == arr.data + 5); +} + +void extensions_test(void) { + int *cmp = xmalloc(TEST_LEN * sizeof(int)); + for (size_t i = 0; i < TEST_LEN; i++) + cmp[i] = rand(); + + int_dynarr_t extend = DYNARR_INIT(int_dynarr_t); + DYNARR_EXTEND_FIXED(extend, cmp, TEST_LEN); + chi_assert("extend.data != cmp", memcmp(extend.data, cmp, TEST_LEN * sizeof(int)) == 0); + chi_assert("extend.len != TEST_LEN", extend.len == TEST_LEN); + DYNARR_DEINIT(extend); + + int_dynarr_t push = DYNARR_INIT(int_dynarr_t); + for (size_t i = 0; i < TEST_LEN; i++) + DYNARR_PUSH(push, cmp[i]); + chi_assert("push.data != cmp", memcmp(push.data, cmp, TEST_LEN * sizeof(int)) == 0); + chi_assert("push.len != TEST_LEN", push.len == TEST_LEN); + 
DYNARR_DEINIT(push); + + int_dynarr_t both = DYNARR_INIT(int_dynarr_t); + DYNARR_EXTEND_FIXED(both, cmp, TEST_LEN / 2); + for (size_t i = TEST_LEN / 2; i < TEST_LEN; i++) + DYNARR_PUSH(both, cmp[i]); + chi_assert("both.data != cmp", memcmp(both.data, cmp, TEST_LEN * sizeof(int)) == 0); + chi_assert("both.len != TEST_LEN", both.len == TEST_LEN); + DYNARR_DEINIT(both); + + xfree(cmp); +} + +void insert_test(void) { + size_t_dynarr_t increm = DYNARR_INIT(size_t_dynarr_t); + + for (size_t i = 0; i < TEST_LEN; i += 2) + DYNARR_PUSH(increm, i); + for (size_t i = 1; i < TEST_LEN; i += 2) + DYNARR_INSERT(increm, i, i); + chi_assert("arr.len == TEST_LEN", increm.len == TEST_LEN); + for (size_t i = 0; i < TEST_LEN; i++) + chi_assert("arr[i] == i", *DYNARR_GET(increm, i) == i); + + DYNARR_DEINIT(increm); + + int_dynarr_t randins = DYNARR_INIT(int_dynarr_t); + + long long parity = 0, check = 0; + + for (size_t i = 0; i < TEST_LEN; i++) { + int gen = rand() % 10; + parity += gen; + DYNARR_INSERT(randins, (rand() % (randins.len + 1)), gen); + } + chi_assert("arr.len == TEST_LEN", randins.len == TEST_LEN); + for (size_t i = 0; i < TEST_LEN; i++) + check += *DYNARR_GET(randins, i); + chi_assert("parity == check", parity == check); + + DYNARR_DEINIT(randins); +} + +void check_arr(int check[], size_t check_len) { + int_dynarr_t dyn = DYNARR_INIT(int_dynarr_t); + + DYNARR_EXTEND_FIXED(dyn, check, check_len); + chi_assert("dyn.len == check_len", dyn.len == check_len); + chi_assert("dyn.data == check", memcmp(dyn.data, check, check_len * sizeof(int)) == 0); + + for (size_t i = 0; i < dyn.len; i++) { + size_t ind = rand() % check_len; + DYNARR_REMOVE(dyn, ind); + memmove(check + ind, check + ind + 1, (check_len - ind - 1) * sizeof(int)); + check_len -= 1; + chi_assert("dyn.len == check_len (modified)", dyn.len == check_len); + chi_assert("dyn.data == check (modified)", memcmp(dyn.data, check, check_len * sizeof(int)) == 0); + } +} + +void remove_test(void) { + int_dynarr_t randdel = 
DYNARR_INIT(int_dynarr_t); + + long long parity = 0; + + for (size_t i = 0; i < TEST_LEN; i++) { + int gen = rand() % 10; + parity += gen; + DYNARR_PUSH(randdel, gen); + } + long long check = parity; + for (size_t i = 0; i < TEST_LEN; i++) { + size_t ind = rand() % randdel.len; + check -= *DYNARR_GET(randdel, ind); + DYNARR_REMOVE(randdel, ind); + chi_assert("randdel.len == TEST_LEN - i - 1", randdel.len == TEST_LEN - i - 1); + } + chi_assert("check == 0", check == 0); + + int *c = xmalloc(TEST_LEN * sizeof(int)); + for (int i = 0; i < TEST_LEN; i++) + c[i] = rand(); + check_arr(c, TEST_LEN); + xfree(c); + + DYNARR_DEINIT(randdel); +} + +int main(int argc, char **argv) { + if (argc != 2) + return 1; + if (strcmp(argv[1], "push") == 0) { + push_test(); + } else if (strcmp(argv[1], "get") == 0) { + get_test(); + } else if (strcmp(argv[1], "extensions") == 0) { + extensions_test(); + } else if (strcmp(argv[1], "insert") == 0) { + insert_test(); + } else if (strcmp(argv[1], "remove") == 0) { + remove_test(); + } +} diff --git a/tests/dynarr_extensions.c b/tests/dynarr_extensions.c new file mode 100644 index 0000000..55ca3d7 --- /dev/null +++ b/tests/dynarr_extensions.c @@ -0,0 +1,34 @@ +#include "util.h" +#include "unit.h" + +#define TEST_LEN 32768 + +int tests_dynarr_extensions(int argc, char **argv) { + int *cmp = xmalloc(TEST_LEN * sizeof(int)); + for (size_t i = 0; i < TEST_LEN; i++) + cmp[i] = rand(); + + int_dynarr_t extend = dynarr_initi(int_dynarr_t); + dynarr_extend_fixed(extend, cmp, TEST_LEN); + chi_assert("extend.data != cmp", memcmp(extend.data, cmp, TEST_LEN * sizeof(int)) == 0); + chi_assert("extend.len != TEST_LEN", extend.len == TEST_LEN); + dynarr_destroy(extend); + + int_dynarr_t push = dynarr_initi(int_dynarr_t); + for (size_t i = 0; i < TEST_LEN; i++) + dynarr_push(push, cmp[i]); + chi_assert("push.data != cmp", memcmp(push.data, cmp, TEST_LEN * sizeof(int)) == 0); + chi_assert("push.len != TEST_LEN", push.len == TEST_LEN); + dynarr_destroy(push); 
+ + int_dynarr_t both = dynarr_initi(int_dynarr_t); + dynarr_extend_fixed(both, cmp, TEST_LEN / 2); + for (size_t i = TEST_LEN / 2; i < TEST_LEN; i++) + dynarr_push(both, cmp[i]); + chi_assert("both.data != cmp", memcmp(both.data, cmp, TEST_LEN * sizeof(int)) == 0); + chi_assert("both.len != TEST_LEN", both.len == TEST_LEN); + dynarr_destroy(both); + + xfree(cmp); + return 0; +} diff --git a/tests/dynarr_get.c b/tests/dynarr_get.c new file mode 100644 index 0000000..0895806 --- /dev/null +++ b/tests/dynarr_get.c @@ -0,0 +1,13 @@ +#include "util.h" +#include "unit.h" + +#define TEST_LEN 32768 + +int tests_dynarr_get(int argc, char **argv) { + int_dynarr_t arr = dynarr_initi(int_dynarr_t); + for (int i = 0; i < 100; i++) + dynarr_push(arr, 0); + chi_assert("get(5) == arr.data + 5", dynarr_get(arr, 5) == arr.data + 5); + dynarr_destroy(arr); + return 0; +} diff --git a/tests/dynarr_get1_death.c b/tests/dynarr_get1_death.c new file mode 100644 index 0000000..371f82a --- /dev/null +++ b/tests/dynarr_get1_death.c @@ -0,0 +1,7 @@ +#include "util.h" + +int tests_dynarr_get1_death(int argc, char **argv) { + int_dynarr_t a = dynarr_initi(int_dynarr_t); + dynarr_get(a, 0); + return 0; +} diff --git a/tests/dynarr_get2_death.c b/tests/dynarr_get2_death.c new file mode 100644 index 0000000..b233c18 --- /dev/null +++ b/tests/dynarr_get2_death.c @@ -0,0 +1,9 @@ +#include "util.h" + +int tests_dynarr_get2_death(int argc, char **argv) { + int_dynarr_t a = dynarr_initi(int_dynarr_t); + for (int i = 0; i < 1000; i++) + dynarr_push(a, i); + dynarr_get(a, a.len); + return 0; +} diff --git a/tests/dynarr_get3_death.c b/tests/dynarr_get3_death.c new file mode 100644 index 0000000..5f7ea2f --- /dev/null +++ b/tests/dynarr_get3_death.c @@ -0,0 +1,10 @@ +#include "util.h" + +int tests_dynarr_get3_death(int argc, char **argv) { + int_dynarr_t a = dynarr_initi(int_dynarr_t); + for (int i = 0; i < 1000; i++) + dynarr_push(a, i); + int k = -1; + dynarr_get(a, k); + return 0; +} diff --git 
a/tests/dynarr_insert.c b/tests/dynarr_insert.c
new file mode 100644
index 0000000..c250169
--- /dev/null
+++ b/tests/dynarr_insert.c
@@ -0,0 +1,48 @@
+#include "util.h"
+#include "unit.h"
+
+/* Number of elements each phase of the test puts into its array. */
+#define TEST_LEN 32768
+
+/*
+ * Test entry point for dynarr_insert.
+ *
+ * Phase 1 pushes the even numbers below TEST_LEN, inserts every odd number
+ * at the index equal to its value, and checks that slot i ends up holding i.
+ * Phase 2 inserts random digits at random positions and checks that the sum
+ * of the stored elements equals the sum of the generated digits.
+ *
+ * argc and argv are unused. Returns 0 on success; a failed chi_assert
+ * terminates the process with exit status 1.
+ */
+int tests_dynarr_insert(int argc, char **argv) {
+    size_dynarr_t increm = dynarr_initi(size_dynarr_t);
+
+    for (size_t i = 0; i < TEST_LEN; i += 2)
+        dynarr_push(increm, i);
+    for (size_t i = 1; i < TEST_LEN; i += 2)
+        dynarr_insert(increm, i, i);
+    chi_assert("arr.len == TEST_LEN", increm.len == TEST_LEN);
+    for (size_t i = 0; i < TEST_LEN; i++)
+        chi_assert("arr[i] == i", *dynarr_get(increm, i) == i);
+
+    dynarr_destroy(increm);
+
+    int_dynarr_t randins = dynarr_initi(int_dynarr_t);
+
+    long long parity = 0, check = 0;
+
+    for (size_t i = 0; i < TEST_LEN; i++) {
+        int gen = rand() % 10;
+        parity += gen;
+        /* len + 1 so that appending at the very end is a possible position. */
+        dynarr_insert(randins, (rand() % (randins.len + 1)), gen);
+    }
+    chi_assert("arr.len == TEST_LEN", randins.len == TEST_LEN);
+    for (size_t i = 0; i < TEST_LEN; i++)
+        check += *dynarr_get(randins, i);
+    chi_assert("parity == check", parity == check);
+
+    dynarr_destroy(randins);
+    return 0;
+}
diff --git a/tests/dynarr_remove.c b/tests/dynarr_remove.c
new file mode 100644
index 0000000..74d7164
--- /dev/null
+++ b/tests/dynarr_remove.c
@@ -0,0 +1,77 @@
+#include "util.h"
+#include "unit.h"
+
+/* Number of elements used by each part of the test. */
+#define TEST_LEN 32768
+
+/*
+ * Mirrors random removals from a dynamic array in a plain C array.
+ *
+ * Copies the check_len ints of `check` into a fresh int_dynarr_t, then
+ * repeatedly removes a random index from both the dynamic array and `check`
+ * (which is modified in place), asserting after every step that length and
+ * contents still agree.
+ *
+ * NOTE(review): the loop counter grows while dyn.len shrinks, so the loop
+ * ends after about half of the elements are removed -- confirm that this is
+ * the intended coverage.
+ */
+void check_arr(int check[], size_t check_len) {
+    int_dynarr_t dyn = dynarr_initi(int_dynarr_t);
+
+    dynarr_extend_fixed(dyn, check, check_len);
+    chi_assert("dyn.len == check_len", dyn.len == check_len);
+    chi_assert("dyn.data == check", memcmp(dyn.data, check, check_len * sizeof(int)) == 0);
+
+    for (size_t i = 0; i < dyn.len; i++) {
+        size_t ind = rand() % check_len;
+        dynarr_remove(dyn, ind);
+        /* Close the same gap in the reference array. */
+        memmove(check + ind, check + ind + 1, (check_len - ind - 1) * sizeof(int));
+        check_len -= 1;
+        chi_assert("dyn.len == check_len (modified)", dyn.len == check_len);
+        chi_assert("dyn.data == check (modified)", memcmp(dyn.data, check, check_len * sizeof(int)) == 0);
+    }
+
+    /* Release the array so this helper does not leak it. */
+    dynarr_destroy(dyn);
+}
+
+/*
+ * Test entry point for dynarr_remove.
+ *
+ * Fills an array with random digits, removes elements at random indices
+ * until it is empty while subtracting each removed value from the running
+ * sum (which must end at 0), then runs check_arr on TEST_LEN random ints.
+ *
+ * argc and argv are unused. Returns 0 on success; a failed chi_assert
+ * terminates the process with exit status 1.
+ */
+int tests_dynarr_remove(int argc, char **argv) {
+    int_dynarr_t randdel = dynarr_initi(int_dynarr_t);
+
+    long long parity = 0;
+
+    for (size_t i = 0; i < TEST_LEN; i++) {
+        int gen = rand() % 10;
+        parity += gen;
+        dynarr_push(randdel, gen);
+    }
+    long long check = parity;
+    for (size_t i = 0; i < TEST_LEN; i++) {
+        size_t ind = rand() % randdel.len;
+        check -= *dynarr_get(randdel, ind);
+        dynarr_remove(randdel, ind);
+        chi_assert("randdel.len == TEST_LEN - i - 1", randdel.len == TEST_LEN - i - 1);
+    }
+    chi_assert("check == 0", check == 0);
+
+    int *c = xmalloc(TEST_LEN * sizeof(int));
+    for (int i = 0; i < TEST_LEN; i++)
+        c[i] = rand();
+    check_arr(c, TEST_LEN);
+    xfree(c);
+
+    dynarr_destroy(randdel);
+    return 0;
+}
diff --git a/tests/hset_add.c b/tests/hset_add.c
new file mode 100644
index 0000000..89f1a87
--- /dev/null
+++ b/tests/hset_add.c
@@ -0,0 +1,93 @@
+#include <stdio.h>
+#include <string.h>
+
+#include "util.h"
+#include "unit.h"
+
+/* Hash callback: passes the pointed-to string and its length to parityhash. */
+size_t charp_parityhash(char **ptr) {
+    return parityhash(*ptr, strlen(*ptr));
+}
+
+/* Equality callback: true when the two pointed-to strings compare equal. */
+bool charp_cmp(char **lhs, char **rhs) {
+    return strcmp(*lhs, *rhs) == 0;
+}
+
+/*
+ * Test entry point for hset_add on a set of C strings.
+ *
+ * The assertions expect hset_add to return false for a new element and true
+ * for one that is already present. Part 1 checks this for two literals and
+ * for a heap copy with equal contents. Part 2 adds heap copies of the
+ * decimal strings "0" through "999", then counts through the same range
+ * again and expects every string to be found.
+ *
+ * NOTE(review): the copies added to hset2 are never freed here -- confirm
+ * whether hset_destroy releases stored elements.
+ *
+ * argc and argv are unused. Returns 0 on success; a failed chi_assert
+ * terminates the process with exit status 1.
+ */
+int tests_hset_add(int argc, char **argv) {
+    charp_hset_t hset1;
+    hset_init(charp_hset_t, hset1, charp_parityhash, charp_cmp);
+
+    char *ptr = "hello";
+    chi_assert("\"hello\" should not be apart of the set", !hset_add(hset1, ptr));
+    chi_assert("\"hello\" should be apart of the set", hset_add(hset1, ptr));
+
+    ptr = "ehllo";
+    chi_assert("\"ehllo\" should not be apart of the set", !hset_add(hset1, ptr));
+    /* Equal contents at a different address must still count as present. */
+    char *heap = xmalloc(strlen(ptr) + 1);
+    strcpy(heap, ptr);
+    chi_assert("\"ehllo\" should be apart of the set", hset_add(hset1, heap));
+    xfree(heap);
+
+    hset_destroy(hset1);
+
+    charp_hset_t hset2;
+    hset_init(charp_hset_t, hset2, charp_parityhash, charp_cmp);
+
+#define MAX_STR_SIZE 3
+    /* One byte beyond MAX_STR_SIZE digits plus terminator: the counter
+     * briefly holds "1000" before the loop condition stops it. */
+    ptr = xmalloc(MAX_STR_SIZE + 2);
+    strcpy(ptr, "0");
+    char buf[100];
+    for(int len = 1; len < MAX_STR_SIZE + 1;) {
+        snprintf(buf, sizeof(buf)-1, "\"%s\" shouldn't be apart of the set", ptr);
+        char *tmp = xmalloc(strlen(ptr) + 1);
+        strcpy(tmp, ptr);
+        chi_assert(buf, !hset_add(hset2, tmp));
+        /* Add one to the decimal string, carrying from the last digit up. */
+        int c = 1;
+        for (int i = len - 1, s = 0; i >= 0; i--)
+            s = (ptr[i] - '0') + c, c = s > 9, ptr[i] = s % 10 + '0';
+        if (c != 0) {
+            /* Carry out of the top digit: shift the digits and terminator
+             * right by one and prepend '1'. */
+            memmove(ptr + 1, ptr, (len++) + 1);
+            *ptr = '1';
+        }
+    }
+    strcpy(ptr, "0");
+    for(int len = 1; len < MAX_STR_SIZE + 1;) {
+        snprintf(buf, sizeof(buf)-1, "\"%s\" should be apart of the set", ptr);
+        chi_assert(buf, hset_find(hset2, ptr));
+        chi_assert(buf, hset_add(hset2, ptr));
+        /* Same in-place increment as in the first loop. */
+        int c = 1;
+        for (int i = len - 1, s = 0; i >= 0; i--)
+            s = (ptr[i] - '0') + c, c = s > 9, ptr[i] = s % 10 + '0';
+        if (c != 0) {
+            memmove(ptr + 1, ptr, (len++) + 1);
+            *ptr = '1';
+        }
+    }
+    xfree(ptr);
+    hset_destroy(hset2);
+
+    return 0;
+}
diff --git a/tests/hset_iter.c b/tests/hset_iter.c
new file mode 100644
index 0000000..cea8f0b
--- /dev/null
+++ b/tests/hset_iter.c
@@ -0,0 +1,45 @@
+#include <stddef.h>
+#include <stdio.h>
+
+#include "util.h"
+#include "unit.h"
+
+/* Hash callback for the int set: the value itself is used as the hash. */
+size_t int_hash(int *ptr) {
+    return *ptr;
+}
+
+/*
+ * Test entry point for hset_iter.
+ *
+ * Adds 0..999 to hset1, copies every element hset_iter yields from hset1
+ * into hset2, and checks that hset2 then contains each of 0..999.
+ *
+ * argc and argv are unused. Returns 0 on success; a failed chi_assert
+ * terminates the process with exit status 1.
+ */
+int tests_hset_iter(int argc, char **argv) {
+    int_hset_t hset1, hset2;
+    hset_init(int_hset_t, hset1, int_hash, NULL);
+    hset_init(int_hset_t, hset2, int_hash, NULL);
+
+    for(int i = 0; i < 1000; i++)
+        hset_add(hset1, i);
+
+    void *saveptr = NULL;
+    int *data;
+    /* Walk the populated set; iterating the still-empty hset2 copies nothing. */
+    while ((data = hset_iter(hset1, saveptr)) != NULL)
+        hset_add(hset2, *data);
+
+    char msgbuf[100];
+    for(int i = 0; i < 1000; i++) {
+        snprintf(msgbuf, sizeof(msgbuf), "%d not in hset2", i);
+        chi_assert(msgbuf, hset_find(hset2, i));
+    }
+
+    hset_destroy(hset2);
+    hset_destroy(hset1);
+
+    return 0;
+}
diff --git a/tests/json_write.c b/tests/json_write.c
new file mode 100644
index 0000000..5f8e720
--- /dev/null
+++ b/tests/json_write.c
@@ -0,0 +1,91 @@
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "json.h"
+#include "unit.h"
+
+/* Exact text json_write is expected to produce for the value built below. */
+#define CMP_STR "{\"string\":\"hello\",\"number\":69,\"boolean\":true,\"null\":" \
+    "null,\"null\":null,\"array\":[\"hello\",\"world\",\"it's\",\"max\"," \
+    "\"flow\",\"with\",\"ryhmes\",\"so-so\",0,-100,false,null],\"object\":{\"hello\":" \
+    "\"dipshit\"}}"
+
+/*
+ * Test entry point for json_write.
+ *
+ * Builds an object holding a string, a number, a boolean, two "null" keys,
+ * an array and a nested object, writes it through a pipe with json_write,
+ * reads the bytes back and compares them with CMP_STR.
+ *
+ * argc and argv are unused. Returns 0 on success and 1 when pipe, fdopen or
+ * read fails; a failed chi_assert terminates the process with exit status 1.
+ */
+int tests_json_write(int argc, char **argv) {
+    jsonkv_dynarr_t map = dynarr_initi(jsonkv_dynarr_t);
+    jsonkv_t kv;
+    kv = (jsonkv_t){ .key = "string", .val = json_createstr("hello") };
+    dynarr_push(map, kv);
+    kv = (jsonkv_t){ .key = "number", .val = json_createint(69) };
+    dynarr_push(map, kv);
+    kv = (jsonkv_t){ .key = "boolean", .val = json_createbool(true) };
+    dynarr_push(map, kv);
+    kv = (jsonkv_t){ .key = "null", .val = json_createnull() };
+    dynarr_push(map, kv);
+    /* Pushed a second time: CMP_STR expects the "null" member twice. */
+    dynarr_push(map, kv);
+    jsonval_dynarr_t subarr = dynarr_initi(jsonval_dynarr_t);
+    dynarr_push(subarr, json_createstr("hello"));
+    dynarr_push(subarr, json_createstr("world"));
+    dynarr_push(subarr, json_createstr("it's"));
+    dynarr_push(subarr, json_createstr("max"));
+    dynarr_push(subarr, json_createstr("flow"));
+    dynarr_push(subarr, json_createstr("with"));
+    dynarr_push(subarr, json_createstr("ryhmes"));
+    dynarr_push(subarr, json_createstr("so-so"));
+    dynarr_push(subarr, json_createint(0));
+    dynarr_push(subarr, json_createint(-100));
+    dynarr_push(subarr, json_createbool(false));
+    dynarr_push(subarr, json_createnull());
+    jsonval_t j_subarr = json_createarr(subarr);
+    kv = (jsonkv_t){ .key = "array", .val = j_subarr };
+    dynarr_push(map, kv);
+    jsonkv_dynarr_t submap = dynarr_initi(jsonkv_dynarr_t);
+    kv = (jsonkv_t){ .key = "hello", .val = json_createstr("dipshit") };
+    dynarr_push(submap, kv);
+    kv = (jsonkv_t){ .key = "object", .val = json_createobj(submap) };
+    dynarr_push(map, kv);
+    jsonval_t j_map = json_createobj(map);
+
+    int fds[2] = { 11, 12 };
+    if (pipe(fds) < 0) {
+        fprintf(stderr, "pipe() failed: %s\n", strerror(errno));
+        return 1;
+    }
+    FILE *writer = fdopen(fds[1], "w");
+    if (writer == NULL) {
+        fprintf(stderr, "fdopen() failed: %s\n", strerror(errno));
+        return 1;
+    }
+    json_write(writer, &j_map);
+    json_destroy(&j_map);
+
+    char buf[1000];
+    fflush(writer);
+    /* Leave one byte free so the terminator below always fits in buf. */
+    ssize_t cnt = read(fds[0], buf, sizeof(buf) - 1);
+    if (cnt < 0) {
+        fprintf(stderr, "read() failed: %s\n", strerror(errno));
+        return 1;
+    }
+    buf[cnt] = '\0';
+    printf("cmp: %s\n", CMP_STR);
+    printf("buf: %s\n", buf);
+    chi_assert("test strings do not match", strcmp(buf, CMP_STR) == 0);
+
+    fclose(writer);
+    close(fds[0]);
+
+    return 0;
+}
diff --git a/tests/robots_txt.c b/tests/robots_txt.c
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit.h b/tests/unit.h
new file mode 100644
index 0000000..0dd525d
--- /dev/null
+++ b/tests/unit.h
@@ -0,0 +1,14 @@
+#ifndef UNIT_H
+#define UNIT_H
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * Minimal assertion for the test programs: when `test` is false, prints
+ * `message` and the current line number to stderr and exits with status 1.
+ * Wrapped in do { } while(0) so it can be used as a single statement.
+ */
+#define chi_assert(message, test) do { if (!(test)) { fprintf(stderr, "ASSERT FAILED (line %d): %s\n", __LINE__, (message)); exit(1); } } while(0)
+
+#endif /* UNIT_H */