From 20c3cd55106adcc3d1404b9be22c96f76e5b6b83 Mon Sep 17 00:00:00 2001
From: Ava Pagefault <ava@avaaa.gay>
Date: Sun, 1 Sep 2024 10:12:11 -0400
Subject: [PATCH] initial commit

---
 .gitignore                |   4 +
 CMakeLists.txt            |  52 +++
 README.md                 |  12 +
 flake.lock                |  27 ++
 flake.nix                 |  56 +++
 include/crawler.h         |  14 +
 include/http.h            |  46 +++
 include/json.h            |  40 ++
 include/module.h          |  82 ++++
 include/util.h            | 358 +++++++++++++++++
 src/crawler.c             | 794 ++++++++++++++++++++++++++++++++++++++
 src/http.c                | 152 ++++++++
 src/json.c                |  88 +++++
 src/main.c                |  39 ++
 src/mod_debug.c           |  56 +++
 src/mod_pagedata.c        | 149 +++++++
 src/mod_parse.c           | 214 ++++++++++
 src/mod_robots.c          | 235 +++++++++++
 src/mod_tidy.c            |  71 ++++
 src/module.c              |  28 ++
 src/util.c                | 228 +++++++++++
 tests/deque_pop.c         |  45 +++
 tests/deque_push.c        |  58 +++
 tests/dynarr.c.old        | 133 +++++++
 tests/dynarr_extensions.c |  34 ++
 tests/dynarr_get.c        |  13 +
 tests/dynarr_get1_death.c |   7 +
 tests/dynarr_get2_death.c |   9 +
 tests/dynarr_get3_death.c |  10 +
 tests/dynarr_insert.c     |  35 ++
 tests/dynarr_remove.c     |  50 +++
 tests/hset_add.c          |  69 ++++
 tests/hset_iter.c         |  34 ++
 tests/json_write.c        |  78 ++++
 tests/robots_txt.c        |   0
 tests/unit.h              |   3 +
 36 files changed, 3323 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 CMakeLists.txt
 create mode 100644 README.md
 create mode 100644 flake.lock
 create mode 100644 flake.nix
 create mode 100644 include/crawler.h
 create mode 100644 include/http.h
 create mode 100644 include/json.h
 create mode 100644 include/module.h
 create mode 100644 include/util.h
 create mode 100644 src/crawler.c
 create mode 100644 src/http.c
 create mode 100644 src/json.c
 create mode 100644 src/main.c
 create mode 100644 src/mod_debug.c
 create mode 100644 src/mod_pagedata.c
 create mode 100644 src/mod_parse.c
 create mode 100644 src/mod_robots.c
 create mode 100644 src/mod_tidy.c
 create mode 100644 src/module.c
 create mode 100644 src/util.c
 create mode 100644 tests/deque_pop.c
 create mode 100644 tests/deque_push.c
 create mode 100644 tests/dynarr.c.old
 create mode 100644 tests/dynarr_extensions.c
 create mode 100644 tests/dynarr_get.c
 create mode 100644 tests/dynarr_get1_death.c
 create mode 100644 tests/dynarr_get2_death.c
 create mode 100644 tests/dynarr_get3_death.c
 create mode 100644 tests/dynarr_insert.c
 create mode 100644 tests/dynarr_remove.c
 create mode 100644 tests/hset_add.c
 create mode 100644 tests/hset_iter.c
 create mode 100644 tests/json_write.c
 create mode 100644 tests/robots_txt.c
 create mode 100644 tests/unit.h

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c3c62dc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+build/
+.cache/
+result
+outputs/
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..a097b53
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,52 @@
+cmake_minimum_required(VERSION 3.14)
+
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_C_STANDARD_REQUIRED ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON CACHE INTERNAL "")
+# The *_INIT variables only seed the cached per-config flags, so they are set before project().
+set(CMAKE_C_FLAGS_RELEASE_INIT "-Wall -Wextra -Wpedantic -Wno-language-extension-token -Wno-gnu-statement-expression-from-macro-expansion")
+set(CMAKE_C_FLAGS_DEBUG_INIT "${CMAKE_C_FLAGS_RELEASE_INIT} -gdwarf-4")
+# Only C is compiled; declaring it avoids probing for (and requiring) a C++ compiler.
+project(Spider2 VERSION 1.0 LANGUAGES C)
+
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+
+find_package(CURL REQUIRED)
+
+# Main executable
+
+file(GLOB_RECURSE srcFiles src/*.c)
+add_executable(${PROJECT_NAME} ${srcFiles})
+target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+# tidy is linked as a library rather than a raw "-ltidy" option so it lands after the object files.
+target_link_libraries(${PROJECT_NAME} PRIVATE CURL::libcurl tidy)
+
+# Tests
+
+include(CTest)
+
+# All unit tests are compiled into one driver (CommonTests) generated by create_test_sourcelist;
+# CTest selects a test by passing its source path without extension, e.g. "tests/dynarr_get".
+if(BUILD_TESTING)
+    set(testsToRun tests/dynarr_extensions.c tests/dynarr_get.c tests/dynarr_get1_death.c tests/dynarr_get2_death.c tests/dynarr_get3_death.c tests/dynarr_insert.c tests/deque_push.c tests/deque_pop.c tests/json_write.c tests/hset_iter.c tests/hset_add.c)
+
+    create_test_sourcelist(tests CommonTests.c ${testsToRun})
+    add_executable(CommonTests ${tests} src/util.c src/json.c)
+    target_link_libraries(CommonTests PRIVATE CURL::libcurl)
+    target_include_directories(CommonTests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+    target_compile_options(CommonTests PRIVATE -gdwarf-4)
+    target_link_options(CommonTests PRIVATE -gdwarf-4)
+
+    # Tests whose name ends in "_death" are expected to fail, so their result is inverted.
+    foreach(testFile IN LISTS testsToRun)
+        get_filename_component(testName "${testFile}" NAME_WE)
+        add_test(NAME ${testName} COMMAND CommonTests tests/${testName})
+        string(REGEX MATCH "_death$" shouldDie "${testName}")
+        if(shouldDie)
+            set_property(TEST ${testName} PROPERTY WILL_FAIL ON)
+        endif()
+    endforeach()
+endif()
+
+# Install rules (explicit destination: CMake older than 3.14 has no default for executables)
+install(TARGETS Spider2 RUNTIME DESTINATION bin)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..4ba37fd
--- /dev/null
+++ b/README.md
@@ -0,0 +1,12 @@
+# Shitty http spider in C
+lol
+## Build Instructions
+To build this, you need CMake, a C compiler, and the curl and tidy libraries together with their development header packages.
+```
+mkdir build
+cd build
+cmake -S .. -B . -DCMAKE_BUILD_TYPE=Release
+cmake --build .
+```
+After building, the binary is at `./build/Spider2` relative to the repo root.
+To run the tests, run `ctest` in `./build` relative to the repo root.
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 0000000..852582c
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,27 @@
+{
+  "nodes": {
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1718350267,
+        "narHash": "sha256-hrf/m9msEun15Vbs8+IOijFe4Sb58KxG/BnDSL9xgZQ=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "ecbc30d5ed9f75449233b17d4a4cdeab53af793f",
+        "type": "github"
+      },
+      "original": {
+        "owner": "NixOS",
+        "ref": "release-24.05",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "nixpkgs": "nixpkgs"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 0000000..c6c0777
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,56 @@
+{
+  inputs.nixpkgs.url = "github:NixOS/nixpkgs/release-24.05";
+  outputs = { self, nixpkgs }: 
+  let 
+    defaultArchs = [ "x86_64-linux" "x86_64-darwin" "aarch64-linux" "aarch64-darwin" ];
+    forAllSystems = nixpkgs.lib.attrsets.genAttrs defaultArchs;
+    nixpkgsFor = forAllSystems (system: import nixpkgs { inherit system; overlays = [ self.overlay ]; });
+  in 
+  {
+    overlay = final: prev: {
+      spider2 = final.stdenv.mkDerivation {
+        name = "Spider2";
+        src = ./.;
+        nativeBuildInputs = with final; [ 
+          cmake 
+          pkg-config
+        ];
+        buildInputs = with final; [ curl html-tidy ];
+        dontUnpack = true;
+        configurePhase = ''
+          runHook preConfigure
+          cmake -S $src -B . -DCMAKE_BUILD_TYPE=DEBUG
+          runHook postConfigure
+        '';
+        buildPhase = ''
+          runHook preBuild
+          cmake --build .
+          runHook postBuild
+        '';
+        installPhase = ''
+          runHook preInstall
+          cmake --install . --prefix $out
+          runHook postInstall
+        '';
+      };
+      spider2WithDebug = final.spider2.overrideAttrs (_: _: {
+        hardeningDisable = [ "all" ];
+        dontStrip = true;
+      });
+    };
+    packages = forAllSystems (system: {
+      default = nixpkgsFor."${system}".spider2;
+      inherit (nixpkgsFor."${system}") spider2WithDebug;
+    });
+    devShells = forAllSystems (system: 
+    let
+      pkgs = nixpkgsFor."${system}";
+      debugPkgs = with pkgs; [ clang-tools valgrind gdb ];
+    in
+    {
+      default = pkgs.spider2WithDebug.overrideAttrs (finalAttrs: previousAttrs: {
+        nativeBuildInputs = previousAttrs.nativeBuildInputs ++ debugPkgs;
+      });
+    });
+  };
+}
diff --git a/include/crawler.h b/include/crawler.h
new file mode 100644
index 0000000..8b444a5
--- /dev/null
+++ b/include/crawler.h
@@ -0,0 +1,14 @@
+#ifndef __CRAWLER_H_
+#define __CRAWLER_H_
+
+#include "module.h"
+
+typedef struct {
+    const char **allowedhosts;
+    double req_interval_s;
+    moduleentryp_dynarr_t enabledmodules;
+} crawlerconfig_t;
+
+void crawler(const char *seed, const crawlerconfig_t *config);
+
+#endif
diff --git a/include/http.h b/include/http.h
new file mode 100644
index 0000000..c5b1de7
--- /dev/null
+++ b/include/http.h
@@ -0,0 +1,46 @@
+#ifndef __HTTP_H_
+#define __HTTP_H_
+
+#include <curl/curl.h>
+#include <stdbool.h>
+
+#include "util.h"
+#include "module.h"
+
+typedef struct {
+    int status, flags, num_requests;
+} headercb_data_t;
+
+typedef struct {
+    byte *base, *begin, *end;
+    moduleentryp_dynarr_t *modules;
+    const char *url;
+    bool wasrequested;
+} writecb_data_t;
+
+typedef struct {
+    writecb_data_t writecb_data;
+    headercb_data_t headercb_data;
+} cbdata_t;
+
+#define HEADERCB_VALID_MIME (1 << 0)
+#define HEADERCB_CONTENT_TYPE_ENCOUNTERED (1 << 1)
+
+headercb_data_t http_get_to_buf(CURLcode *res, CURL *curl, byte **cnt, size_t *cntlen);
+char *relative2absolute(CURLU *curl_url_h, const char *parent, const char *relative);
+charp_dynarr_t parsehrefs(CURLU *curl_url_h, const char *url, const char *page, size_t pagelen);
+
+extern const char *useragents[];
+
+typedef struct {
+    const char *host;
+    struct curl_slist *headers;
+    int totalfailurecnt, failurecnt, visitcnt;
+} hostentry_t;
+
+dynarr_def(hostentry_t, host_dynarr_t);
+
+CURL *makehandle(const char *url, hostentry_t *host_entry, cbdata_t *cbdata, bool wasrequested);
+void initcbdata(const char *url, moduleentryp_dynarr_t *modules, cbdata_t *data);
+
+#endif
diff --git a/include/json.h b/include/json.h
new file mode 100644
index 0000000..0cce411
--- /dev/null
+++ b/include/json.h
@@ -0,0 +1,40 @@
+#ifndef __JSON_H_
+#define __JSON_H_
+
+#include <stdio.h>
+#include <stdbool.h>
+
+#include "util.h"
+
+typedef enum {
+    JSON_OBJECT,
+    JSON_ARRAY,
+    JSON_STRING,
+    JSON_INT,
+    JSON_BOOL,
+    JSON_NULL,
+} jsontype_t;
+
+typedef struct jsonval {
+    jsontype_t type;
+    void *data;
+} jsonval_t;
+
+typedef struct {
+    const char *key;
+    jsonval_t val;
+} jsonkv_t;
+
+dynarr_def(jsonval_t, jsonval_dynarr_t);
+dynarr_def(jsonkv_t, jsonkv_dynarr_t);
+
+jsonval_t json_createobj(jsonkv_dynarr_t pairs);
+jsonval_t json_createarr(jsonval_dynarr_t elems);
+jsonval_t json_createstr(const char *str);
+jsonval_t json_createint(long num);
+jsonval_t json_createbool(bool val);
+jsonval_t json_createnull(void);
+void json_destroy(jsonval_t *val);
+void json_write(FILE *out, jsonval_t *val);
+
+#endif
diff --git a/include/module.h b/include/module.h
new file mode 100644
index 0000000..9d8094b
--- /dev/null
+++ b/include/module.h
@@ -0,0 +1,82 @@
+#ifndef __MODULE_H_
+#define __MODULE_H_
+
+#include <curl/curl.h>
+
+#include "util.h"
+
+typedef enum {
+    FILTER_PASS,
+    FILTER_STALL,
+    FILTER_REJECT,
+} filterres_t;
+
+typedef enum {
+    EXTRA_JSON,
+    EXTRA_TIDY,
+    EXTRA_OTHER,
+} extradata_type_t;
+
+typedef struct {
+    extradata_type_t type;
+    char *key;
+    void *val;
+} extradata_t;
+
+dynarr_def(extradata_t, extradata_dynarr_t);
+
+typedef struct {
+    const char *url;
+    CURL *handle;
+    char *page;
+    size_t npage;
+    extradata_dynarr_t extradata;
+    charp_dynarr_t *parsedlinks;
+} pagecompletedata_t;
+
+typedef void (*reqcb_t)(void *userdata, const char *url, char *page, size_t npage, CURL *handle);
+
+typedef struct {
+    const char *url;
+    reqcb_t cb;
+    void *userdata;
+    CURL *handle;
+} requestedreq_t;
+
+dynarr_def(requestedreq_t, requestedreq_dyanrr_t);
+
+typedef struct crawlermodule {
+    void *userdata;
+    // `init` will both initialize the module, and populate all other functions in its entry, if necessary.
+    int (*init)(struct crawlermodule *entry);
+    int (*destroy)(void *userdata);
+    int (*onpagewrite)(void *userdata, const char *url, const byte *data, size_t ndata);
+    int (*onpagecomplete)(void *userdata, pagecompletedata_t *data);
+    int (*onpagedestroy)(void *userdata, pagecompletedata_t *data);
+    filterres_t (*filter)(void *userdata, const char *url);
+} crawlermodule_t;
+
+dynarr_def(crawlermodule_t, crawlermodule_dynarr_t);
+dynarr_def(crawlermodule_t *, crawlermodulep_dynarr_t);
+
+typedef struct {
+    const char *name;
+    crawlermodule_t module;
+} moduleentry_t;
+
+dynarr_def(moduleentry_t, moduleentry_dynarr_t);
+dynarr_def(moduleentry_t *, moduleentryp_dynarr_t);
+
+void *searchextradata(extradata_type_t type, char *key, extradata_t *data, size_t ndata);
+void makerequest(const char *url, reqcb_t cb, void *cbdata);
+
+int mod_pagedata_init(crawlermodule_t *entry);
+int mod_tidy_init(crawlermodule_t *entry);
+int mod_debug_init(crawlermodule_t *entry);
+int mod_parse_init(crawlermodule_t *entry);
+int mod_robots_init(crawlermodule_t *entry);
+
+extern requestedreq_dyanrr_t requestedreqs;
+extern moduleentry_t availmodules[];
+
+#endif
diff --git a/include/util.h b/include/util.h
new file mode 100644
index 0000000..4fb04ab
--- /dev/null
+++ b/include/util.h
@@ -0,0 +1,358 @@
+#ifndef __UTIL_H_
+#define __UTIL_H_
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+typedef enum {
+    LEVEL_DEBUG,
+    LEVEL_INFO,
+    LEVEL_WARN,
+    LEVEL_ERROR,
+    LEVEL_FATAL,
+} loglevel_t;
+
+void volog(loglevel_t level, const char *file, int line, const char *fmt, va_list ap);
+void olog(loglevel_t level, const char *file, int line, const char *fmt, ...);
+void die(const char *fmt, ...);
+void *xmalloc(size_t size);
+void *xcalloc(size_t nmemb, size_t size);
+void *xrealloc(void *ptr, size_t size);
+void xfree(void *ptr);
+size_t sanitize2ascii(char *out, const char *inp, size_t outsize);
+char *sanitize2ascii_dyn(const char *inp, size_t maxlen);
+
+// wikipedia sample function. read comment in util.c
+uint32_t murmur3_32(const uint8_t* key, size_t len, uint32_t seed);
+
+bool hset_charp_cmp(char **lhs, char **rhs);
+size_t hset_charp_hash(char **str);
+
+#define min(A, B) \
+   ({ __typeof__ (A) _a = (A); \
+       __typeof__ (B) _b = (B); \
+     _a < _b ? _a : _b; })
+
+#define max(A, B) \
+   ({ __typeof__ (A) _a = (A); \
+       __typeof__ (B) _b = (B); \
+     _a > _b ? _a : _b; })
+
+#define DYNARR_INIT_CAP 16
+#define DEQUE_INIT_CAP 16
+#define HSET_INIT_CAP 256
+
+#define debug(...) olog(LEVEL_DEBUG, __FILE__, __LINE__, __VA_ARGS__)
+#define info(...) olog(LEVEL_INFO,__FILE__, __LINE__, __VA_ARGS__)
+#define warn( ...) olog(LEVEL_WARN, __FILE__, __LINE__, __VA_ARGS__)
+#define error(...) olog(LEVEL_ERROR, __FILE__, __LINE__, __VA_ARGS__)
+#define fatal(...) olog(LEVEL_FATAL, __FILE__, __LINE__, __VA_ARGS__)
+
+#define array_size(ARR) (sizeof(ARR) / sizeof(typeof(*ARR)))
+
+#define dynarr_def(T, NAME) typedef DYNARR(T) NAME; typedef T NAME ## _innertype
+
+#define DYNARR(T) struct { \
+        T* data; \
+        size_t len, cap; \
+    }
+
+#define dynarr_initi(ARR_T) (ARR_T){ .data = xmalloc(sizeof(ARR_T ## _innertype) * DYNARR_INIT_CAP), .len = 0, .cap = DYNARR_INIT_CAP }
+#define dynarr_init(ARR_T, ARR) (ARR) = dynarr_initi(ARR_T)
+#define dynarr_destroy(ARR) do { \
+        if ((ARR).data == NULL) \
+            break; \
+        xfree((ARR).data); \
+        (ARR).data = NULL; \
+    } while(0)
+
+#define dynarr_push(ARR, ELEM) do { /* append ELEM, doubling the capacity when the array is full */ \
+        if ((ARR).len >= (ARR).cap) { \
+            (ARR).cap *= 2; \
+            (ARR).data = xrealloc((ARR).data, (ARR).cap * sizeof(*(ARR).data)); /* size by element type, not by ELEM's type */ \
+        } \
+        (ARR).data[(ARR).len] = (ELEM); \
+        (ARR).len += 1; \
+    } while(0)
+
+#define dynarr_get(ARR, INDEX) ({ /* bounds-checked access; yields a pointer to element INDEX, dies when out of range */ \
+        size_t dynarr_get_idx_ = (INDEX); /* unusual name so a caller variable called `index` is not shadowed */ \
+        if (dynarr_get_idx_ >= (ARR).len)\
+            die("dyn array out of bounds access"); \
+        &(ARR).data[dynarr_get_idx_]; })
+
+#define dynarr_pop(ARR) ({ \
+        if ((ARR).len < 1) \
+            die("dyn array empty array pop"); \
+        (ARR).data[--(ARR).len]; })
+
+#define dynarr_insert(ARR, INDEX, ELEM) do { /* insert ELEM before position INDEX (INDEX == len appends) */ \
+        size_t dynarr_ins_idx_ = (INDEX); /* unusual name so a caller variable called `index` is not shadowed */ \
+        if (dynarr_ins_idx_ > (ARR).len) \
+            die("dyn array out of bounds insert"); \
+        if ((ARR).len >= (ARR).cap) { \
+            (ARR).cap *= 2; \
+            (ARR).data = xrealloc((ARR).data, (ARR).cap * sizeof(*(ARR).data)); /* size by element type, not by ELEM's type */ \
+        } \
+        memmove((ARR).data + dynarr_ins_idx_ + 1, (ARR).data + dynarr_ins_idx_, ((ARR).len - dynarr_ins_idx_) * sizeof(*(ARR).data)); \
+        (ARR).data[dynarr_ins_idx_] = (ELEM); \
+        (ARR).len += 1; \
+    } while (0)
+
+#define dynarr_remove(ARR, INDEX) do { /* remove element INDEX, shifting the tail down by one; dies when out of range */ \
+        size_t dynarr_rm_idx_ = (INDEX); /* unusual name so a caller variable called `index` is not shadowed */ \
+        if (dynarr_rm_idx_ >= (ARR).len) \
+            die("dyn array out of bounds remove"); \
+        memmove((ARR).data + dynarr_rm_idx_, (ARR).data + dynarr_rm_idx_ + 1, ((ARR).len - dynarr_rm_idx_ - 1) * sizeof(*(ARR).data)); \
+        (ARR).len -= 1; \
+    } while (0)
+
+#define dynarr_extend_fixed(DYN_ARR, FIXED_ARR, NMEMB) do { \
+        size_t nmemb = NMEMB; \
+        size_t new_cap = (DYN_ARR).cap; \
+        while ((DYN_ARR).len + nmemb > new_cap) \
+            new_cap *= 2; \
+        if (new_cap > (DYN_ARR).cap) { \
+            (DYN_ARR).cap = new_cap; \
+            (DYN_ARR).data = xrealloc((DYN_ARR).data, (DYN_ARR).cap * sizeof(*(DYN_ARR).data)); \
+        } \
+        memcpy((DYN_ARR).data + (DYN_ARR).len, FIXED_ARR, nmemb * sizeof(*(DYN_ARR).data)); \
+        (DYN_ARR).len += nmemb; \
+    } while(0)
+
+#define dynarr_extend_dyn(LHS, RHS) do { \
+        size_t new_cap = (LHS).cap; \
+        while ((LHS).len + (RHS).len > new_cap) \
+            new_cap *= 2; \
+        if (new_cap > (LHS).cap) { \
+            (LHS).cap = new_cap; \
+            (LHS).data = xrealloc((LHS).data, (LHS).cap * sizeof(*(LHS).data)); \
+        } \
+        memcpy((LHS).data + (LHS).len, (RHS).data, (RHS).len * sizeof(*(LHS).data)); \
+        (LHS).len += (RHS).len; \
+    } while(0)
+
+dynarr_def(size_t, size_dynarr_t);
+dynarr_def(int, int_dynarr_t);
+dynarr_def(long, long_dynarr_t);
+dynarr_def(long long, long_long_dynarr_t);
+dynarr_def(char *, charp_dynarr_t);
+dynarr_def(char, char_dynarr_t);
+dynarr_def(void *, vp_dynarr_t);
+
+#define DEQUE(T) struct { \
+        T* base;\
+        size_t cap, front, back, len; \
+    } \
+
+#define deque_def(T, NAME) typedef DEQUE(T) NAME; typedef T NAME ## _innertype
+
+//#define deque_init(DEQ_T) { .base = xmalloc(DEQUE_INIT_CAP * sizeof(DEQ_T ## _innertype)), .cap = DEQUE_INIT_CAP, .front = 0, .back = 0, .len = 0 }
+
+#define deque_init(DEQ_T, DEQ) do { (DEQ).base = xmalloc(DEQUE_INIT_CAP * sizeof(DEQ_T ## _innertype)), (DEQ).cap = DEQUE_INIT_CAP, (DEQ).front = 0, (DEQ).back = 0, (DEQ).len = 0; } while (0)
+
+#define deque_destroy(DEQ) do { \
+        if ((DEQ).base == NULL)\
+            break; \
+        xfree((DEQ).base); \
+        (DEQ).base = NULL; \
+    } while (0)
+
+#define deque_grow(DEQ, NEW_CAP) do { \
+        if ((DEQ).cap >= NEW_CAP) \
+            continue; \
+        size_t new_cap = NEW_CAP, size = sizeof(typeof(*(DEQ).base)); \
+        typeof((DEQ).base) new_base = xmalloc(new_cap * size); \
+        if ((DEQ).len > 0 && (DEQ).front >= (DEQ).back) { \
+            size_t end_len = (DEQ).cap - (DEQ).front; \
+            memcpy(new_base, (DEQ).base + (DEQ).front, end_len * size);\
+            memcpy(new_base + end_len, (DEQ).base, (DEQ).back * size); \
+        } \
+        else { \
+            memcpy(new_base, (DEQ).base + (DEQ).front, (DEQ).len * size); \
+        } \
+        xfree((DEQ).base); \
+        (DEQ).base = new_base, (DEQ).cap = new_cap, (DEQ).front = 0, (DEQ).back = (DEQ).len; \
+    } while (0)
+
+#define deque_push_back(DEQ, ELEM) do { \
+        if ((DEQ).len > 0 && (DEQ).back == (DEQ).front) \
+            deque_grow(DEQ, (DEQ).cap * 2); \
+        (DEQ).base[(DEQ).back] = (ELEM); \
+        (DEQ).back = ((DEQ).back + 1) % (DEQ).cap; \
+        (DEQ).len += 1; \
+    } while (0)
+
+#define deque_pop_back(DEQ) ({ \
+        if ((DEQ).len == 0) \
+            die("deque empty pop back"); \
+        (DEQ).back = (DEQ).back == 0 ? (DEQ).cap - 1 : (DEQ).back - 1; \
+        (DEQ).len -= 1;\
+        (DEQ).base[(DEQ).back]; \
+    })
+
+#define deque_push_front(DEQ, ELEM) do { \
+        if ((DEQ).len > 0 && (DEQ).back == (DEQ).front) \
+            deque_grow(DEQ, (DEQ).cap * 2); \
+        (DEQ).front = (DEQ).front == 0 ? (DEQ).cap - 1 : (DEQ).front - 1; \
+        (DEQ).len += 1; \
+        (DEQ).base[(DEQ).front] = (ELEM); \
+    } while (0)
+
+#define deque_pop_front(DEQ) ({ \
+        if ((DEQ).len == 0) \
+            die("deque empty pop front"); \
+        size_t old_front = (DEQ).front; \
+        (DEQ).front = ((DEQ).front + 1) % (DEQ).cap; \
+        (DEQ).len -= 1;\
+        (DEQ).base[old_front]; \
+    })
+
+#define deque_get(DEQ, INDEX) ({ /* bounds-checked access; yields a pointer to the INDEX-th element counted from the front */ \
+        size_t deque_get_idx_ = (INDEX); /* unusual name so a caller variable called `index` is not shadowed */ \
+        if (deque_get_idx_ >= (DEQ).len) \
+            die("deque out of bounds access"); \
+        &(DEQ).base[((DEQ).front + deque_get_idx_) % (DEQ).cap]; \
+    })
+
+#define deque_clone(DST, SRC) ({ /* copy SRC into DST with its own buffer; cap/front/back/len are taken over unchanged */ \
+        memcpy(&(DST), &(SRC), sizeof(DST)); \
+        (DST).base = xmalloc((SRC).cap * sizeof(typeof(*(DST).base))); /* full capacity, so the copied cap stays valid */ \
+        memcpy((DST).base, (SRC).base, (SRC).cap * sizeof(typeof(*(DST).base))); /* whole ring, so front/back offsets stay valid */ \
+    })
+
+deque_def(int, int_deque_t);
+deque_def(size_t, size_deque_t);
+deque_def(long, long_deque_t);
+deque_def(long long, longlong_deque_t);
+deque_def(char, char_deque_t);
+deque_def(char *, charp_deque_t);
+deque_def(void *, voidp_deque_t);
+
+size_t parityhash(const void *data, size_t ndata);
+
+#define HSET_BUCKET(T, NAME) struct NAME ## _struct { \
+        struct NAME ## _struct *next; \
+        T data; \
+    } 
+
+#define HSET(T, BUCKET_T) struct { \
+        BUCKET_T** buckets; \
+        size_t nbuckets, len; \
+        size_t (*algo)(T*); \
+        bool (*cmp)(T*, T*); \
+    }
+
+#define hset_def(T, NAME) typedef HSET_BUCKET(T, NAME ## _bucket) NAME ## _bucket; \
+    typedef HSET(T, NAME ## _bucket) NAME; \
+    typedef T NAME ## _innertype
+
+#define hset_initi(HSET_T, ALGO, CMP) (HSET_T) { \
+        .buckets = xcalloc(HSET_INIT_CAP, sizeof(HSET_T ## _bucket *)), \
+        .nbuckets = HSET_INIT_CAP, .len = 0, .algo = (ALGO), .cmp = (CMP), \
+    }
+
+#define hset_init(HSET_T, HSET, ALGO, CMP) (HSET) = hset_initi(HSET_T, ALGO, CMP)
+
+#define hset_destroy(HSET) do { \
+        if ((HSET).buckets == NULL) \
+            break; \
+        for (size_t i = 0; i < (HSET).nbuckets; i++) { \
+            if ((HSET).buckets[i] == NULL) \
+                continue; \
+            typeof(*(HSET).buckets) cur = (HSET).buckets[i], next;\
+            for (; cur != NULL; cur = next) { \
+                next = cur->next; \
+                xfree(cur); \
+            } \
+        } \
+        xfree((HSET).buckets); \
+        (HSET).buckets = NULL; \
+    } while (0)
+
+// c has reminded me of how good i had it with rust
+#define hset_iter(HSET, STATE) ({ \
+        typeof(&(HSET).buckets[0]->data) ret;\
+        do { \
+            if ((STATE) == NULL) {\
+                (STATE) = xmalloc(2 * sizeof(void*)); \
+                ((void**) (STATE))[0] = (HSET).buckets; \
+                ((void**) (STATE))[1] = NULL; \
+            } \
+            typeof(&(HSET).buckets) entryptr = &((typeof(&(HSET).buckets)) (STATE))[0];\
+            typeof((HSET).buckets) bucketptr = &((typeof((HSET).buckets)) (STATE))[1];\
+            if (*bucketptr != NULL && (*bucketptr)->next != NULL) { \
+                *bucketptr = (*bucketptr)->next; \
+                ret = &(*bucketptr)->data; \
+                break; \
+            } \
+            *bucketptr = NULL; \
+            for (; *entryptr < (HSET).buckets + (HSET).nbuckets; (*entryptr)++) \
+                if (**entryptr != NULL) \
+                    break; \
+            if (*entryptr < (HSET).buckets + (HSET).nbuckets) { \
+                *bucketptr = **entryptr; \
+                (*entryptr)++; \
+                ret = &(*bucketptr)->data; \
+            } else { \
+                xfree((STATE)); \
+                (STATE) = NULL; \
+                ret = NULL; \
+            } \
+        } while (0); \
+        ret; \
+    })
+
+#define hset_add(HSET, ELEM) ({ /* insert ELEM unless an equal element exists; yields true if it was already present */ \
+        size_t ind = (HSET).algo(&(ELEM)) % (HSET).nbuckets; \
+        bool ret = false; \
+        if ((HSET).buckets[ind] == NULL) {\
+            (HSET).buckets[ind] = xmalloc(sizeof(typeof(**(HSET).buckets))); \
+            (HSET).buckets[ind]->data = (ELEM); \
+            (HSET).buckets[ind]->next = NULL; \
+        } else { \
+            typeof(*(HSET).buckets) cur, prev = NULL; \
+            for (cur = (HSET).buckets[ind]; !ret && cur != NULL; prev = cur, cur = cur->next) { \
+                if ((HSET).cmp == NULL) { \
+                    if(cur->data == (ELEM)) \
+                        ret = true; \
+                } \
+                else if ((*(HSET).cmp)(&cur->data, &(ELEM))) { \
+                    ret = true; \
+                } \
+            } \
+            if (!ret) { /* not found: prev is the chain tail, append there */ \
+                prev->next = xmalloc(sizeof(typeof(**(HSET).buckets))); \
+                prev->next->data = (ELEM); \
+                prev->next->next = NULL; \
+            } \
+        } \
+        if (!ret) (HSET).len += 1; /* keep the element count in sync with insertions */ \
+        ret; \
+    })
+
+#define hset_find(HSET, ELEM) ({ \
+        size_t ind = (HSET).algo(&(ELEM)) % (HSET).nbuckets; \
+        bool ret = false; \
+        typeof(*(HSET).buckets) cur; \
+        for (cur = (HSET).buckets[ind]; !ret && cur != NULL; cur = cur->next) { \
+            if ((HSET).cmp == NULL) { \
+                if(cur->data == (ELEM)) \
+                    ret = true; \
+            } \
+            else if ((*(HSET).cmp)(&cur->data, &(ELEM))) { \
+                ret = true; \
+            } \
+        } \
+        ret; \
+    })
+
+hset_def(int, int_hset_t);
+hset_def(char *, charp_hset_t);
+
+typedef unsigned char byte;
+
+#endif
diff --git a/src/crawler.c b/src/crawler.c
new file mode 100644
index 0000000..61318a3
--- /dev/null
+++ b/src/crawler.c
@@ -0,0 +1,794 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include <signal.h>
+#include <time.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/epoll.h>
+#include <curl/curl.h>
+
+#include "curl/multi.h"
+#include "http.h"
+#include "util.h"
+#include "crawler.h"
+#include "json.h"
+#include "module.h"
+
+// A host whose failurecnt is above this is refused new transfers by starttransfer()
+#define MAX_FAILURE_CNT 5
+// Cap on simultaneously tracked transfers; also passed as CURLMOPT_MAX_TOTAL_CONNECTIONS
+#define MAX_CONNECTIONS 32
+// Passed to libcurl as CURLMOPT_MAX_HOST_CONNECTIONS
+#define MAX_CONNECTIONS_PER_HOST 5
+// Number of epoll_event slots handed to epoll_wait() per call
+#define EPOLL_BUF_SIZE MAX_CONNECTIONS
+// A signal arriving more than this many seconds after the first one kills the process
+#define DEATH_TIMEOUT_S 30
+
+// Set to 1 by fatalsighandler(); polled by the main loop in crawler()
+volatile sig_atomic_t shoulddie = 0;
+// CLOCK_MONOTONIC timestamp of the first signal seen by fatalsighandler()
+volatile struct timespec fatalsigrecved = { 0 };
+
+// Signal handler for graceful shutdown. The first delivery records the current
+// CLOCK_MONOTONIC time in fatalsigrecved and raises shoulddie. A later delivery
+// arriving more than DEATH_TIMEOUT_S seconds after the first one terminates the
+// process through die(). The signal number argument is not used.
+void fatalsighandler(int status) {
+    struct timespec current;
+    // clock_gettime is async-signal-safe
+    int rc = clock_gettime(CLOCK_MONOTONIC, &current);
+    if (rc < 0) {
+        // die uses fprintf, which is not async-signal-safe; tolerated here because
+        // the process is terminating anyway
+        die("clock_gettime failed: %s (code %d)", strerror(errno), errno);
+    }
+
+    if (shoulddie == 0) {
+        // First signal: start the grace period; it cannot have expired yet
+        fatalsigrecved = current;
+        shoulddie = 1;
+        return;
+    }
+
+    if (current.tv_sec - fatalsigrecved.tv_sec > DEATH_TIMEOUT_S) {
+        die("death timeout exceeded (%ds)", DEATH_TIMEOUT_S);
+    }
+}
+
+// Out-of-line wrapper around epoll_wait(); it forwards every argument unchanged
+// and returns epoll_wait()'s result.
+int perf_epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout) {
+    int nready = epoll_wait(epfd, events, maxevents, timeout);
+    return nready;
+}
+
+// Out-of-line wrapper around curl_multi_socket_action(); it forwards every
+// argument unchanged and returns libcurl's result code.
+CURLMcode perf_curl_multi_socket_action(CURLM *multi_handle, curl_socket_t s,
+        int ev_bitmask, int *running_handles) {
+    CURLMcode rc = curl_multi_socket_action(multi_handle, s, ev_bitmask, running_handles);
+    return rc;
+}
+
+// Returns true when `host` exactly matches (strcmp) one entry of the
+// NULL-terminated array `allowed_hosts`. A NULL array means "no restriction"
+// and accepts every host.
+bool is_allowed_host(const char *host, const char **allowed_hosts) {
+    if (!allowed_hosts)
+        return true;
+
+    const char **candidate = allowed_hosts;
+    while (*candidate != NULL) {
+        if (!strcmp(host, *candidate))
+            return true;
+        candidate++;
+    }
+    return false;
+}
+
+// Validates `link` and, if acceptable, queues it at the back of `to_crawl`.
+//
+// `link` is parsed into `curl_url_h` (overwriting whatever URL the handle held)
+// and its punycode host is checked against `allowed_hosts` via is_allowed_host().
+// Returns the libcurl error when parsing or host extraction fails,
+// CURLUE_BAD_HOSTNAME when the host is not allowed, and CURLUE_OK after the link
+// pointer itself (not a copy) has been pushed onto the deque.
+CURLUcode append2tocrawl(CURLU *curl_url_h, char *link, const char **allowed_hosts, charp_deque_t *to_crawl) {
+    char printable[100];
+
+    CURLUcode rc = curl_url_set(curl_url_h, CURLUPART_URL, link, 0);
+    if (rc != CURLUE_OK) {
+        sanitize2ascii(printable, link, sizeof(printable));
+        error("URL parsing failed for \"%s\": %s (code %d)",
+                printable, curl_url_strerror(rc), rc);
+        return rc;
+    }
+
+    char *linkhost = NULL;
+    rc = curl_url_get(curl_url_h, CURLUPART_HOST, &linkhost, CURLU_PUNYCODE);
+    if (rc != CURLUE_OK) {
+        error("URL host parsing failed: %s (code %d)", curl_url_strerror(rc), rc);
+        return rc;
+    }
+
+    bool allowed = is_allowed_host(linkhost, allowed_hosts);
+    if (!allowed) {
+        sanitize2ascii(printable, link, sizeof(printable));
+        error("URL not in allowed hosts \"%s\", not crawling", printable);
+    }
+    // The host string is only needed for the check above
+    curl_free(linkhost);
+    if (!allowed)
+        return CURLUE_BAD_HOSTNAME;
+
+    deque_push_back(*to_crawl, link);
+    return CURLUE_OK;
+}
+
+// NOTE(review): not referenced anywhere in the visible part of this file
+#define MAX_SOCKETS_PER_TRANS 4
+
+// One in-flight request, kept in a singly linked list headed by
+// crawlerstate_t.headtransfer.
+typedef struct transfer {
+    struct transfer *next;  // next transfer in the list, NULL at the tail
+    CURL *handle;           // easy handle created by makehandle()
+    cbdata_t cbdata;        // callback state filled in by initcbdata()
+    char *url;              // the URL pointer that was handed to starttransfer()
+    int hostentryind;       // index of this URL's host in crawlerstate_t.hostentries
+} transfer_t;
+
+// Everything the crawl loop and the libcurl callbacks share.
+typedef struct {
+    const crawlerconfig_t *config;  // caller-supplied configuration
+    charp_deque_t to_crawl;         // URLs waiting to be fetched
+    charp_dynarr_t visited;         // URLs for which a transfer has been started
+    host_dynarr_t hostentries;      // per-host headers and failure counters
+    transfer_t *headtransfer;       // head of the in-flight transfer list
+    int epollfd;                    // epoll instance watching libcurl's sockets
+    long timeout_ms;                // latest timeout reported through timercb()
+    CURLM *curl_multi_h;            // libcurl multi handle driving all transfers
+    int_dynarr_t sockets;           // sockets currently registered with epollfd (see socketcb)
+    moduleentryp_dynarr_t modules;  // enabled crawler modules
+} crawlerstate_t;
+
+// Result codes returned by starttransfer() and transferfinished().
+typedef enum {
+    TRANS_OK,                   // success
+    TRANS_ERROR,                // non-fatal libcurl error, URL is dropped
+    TRANS_VISITED,              // URL had already been visited, nothing started
+    TRANS_HOST_ERROR_EXCEEDED,  // host's failurecnt is above MAX_FAILURE_CNT
+    TRANS_HOST_ERROR,           // connect/resolve/timeout failure, URL re-queued
+    TRANS_WRITE_ERR,            // libcurl reported CURLE_WRITE_ERROR
+    TRANS_FATAL,                // unrecoverable error
+} transfercode_t;
+
+// libcurl socket callback (installed with CURLMOPT_SOCKETFUNCTION). Mirrors
+// libcurl's interest in socket `s` into the epoll instance in `clientp` and keeps
+// clientp->sockets in sync with what is registered there.
+//
+// CURL_POLL_NONE is ignored. CURL_POLL_REMOVE deregisters a known socket.
+// Any other `what` adds the socket if it is new or modifies it if already known.
+// Returns 0 on success and -1 after logging a fatal error.
+int socketcb(CURL *easy, curl_socket_t s, int what, crawlerstate_t *clientp, transfer_t *socketp) {
+    // Position of `s` among the registered sockets, -1 when it is not registered
+    int tracked = -1;
+    for (size_t pos = 0; tracked < 0 && pos < clientp->sockets.len; pos++) {
+        if (clientp->sockets.data[pos] == s)
+            tracked = pos;
+    }
+
+    if (what == CURL_POLL_NONE)
+        return 0;
+
+    if (what == CURL_POLL_REMOVE) {
+        if (tracked < 0) {
+            fatal("invalid socket specified!");
+            return -1;
+        }
+        if (epoll_ctl(clientp->epollfd, EPOLL_CTL_DEL, s, NULL) < 0) {
+            fatal("epoll_ctl failed: %s (code %d)", strerror(errno), errno);
+            return -1;
+        }
+        dynarr_remove(clientp->sockets, tracked);
+        return 0;
+    }
+
+    // Translate libcurl's interest into an epoll event mask; an unrecognised
+    // `what` leaves the mask at zero
+    struct epoll_event interest = { .data = { .fd = s } };
+    if (what == CURL_POLL_IN)
+        interest.events = EPOLLIN;
+    else if (what == CURL_POLL_OUT)
+        interest.events = EPOLLOUT;
+    else if (what == CURL_POLL_INOUT)
+        interest.events = EPOLLIN | EPOLLOUT;
+
+    bool isnew = tracked < 0;
+    if (epoll_ctl(clientp->epollfd, isnew ? EPOLL_CTL_ADD : EPOLL_CTL_MOD, s, &interest) < 0) {
+        fatal("epoll_ctl failed: %s (code %d)", strerror(errno), errno);
+        return -1;
+    }
+    if (isnew)
+        dynarr_push(clientp->sockets, s);
+    return 0;
+}
+
+// libcurl timer callback (installed with CURLMOPT_TIMERFUNCTION). Stores the
+// timeout libcurl asks for into the long that `clientp` points at and always
+// reports success.
+int timercb(CURLM *multi, long timeout_ms, long *clientp) {
+    (void)multi; // the multi handle is not needed here
+    clientp[0] = timeout_ms;
+    return 0;
+}
+
+// Looks up, or creates, the entry for `host` in `host_data`.
+//
+// On return *hostentryind is the index of the entry, or -1 when no entry exists
+// and none could be created (no user agents available or the header list could
+// not be built).
+//
+// Returns true only when the entry already existed. When a new entry is created
+// the function returns false and the entry stores the `host` pointer itself, not
+// a copy, so the caller must keep that string alive. A new entry gets a
+// "User-Agent: ..." header built from a randomly picked element of the
+// NULL-terminated array `avail_uas`.
+bool gethostdata(const char *host, const char **avail_uas, host_dynarr_t *host_data, int *hostentryind) {
+    for (size_t i = 0; i < host_data->len; i++) {
+        if (strcmp(host, host_data->data[i].host) == 0) {
+            *hostentryind = i;
+            return true;
+        }
+    }
+    if (avail_uas == NULL) {
+        *hostentryind = -1;
+        return false;
+    }
+    // pick one
+    size_t navail_uas = 0;
+    while (avail_uas[navail_uas] != NULL)
+        navail_uas++;
+    if (navail_uas == 0) {
+        // An empty list would otherwise make the modulo below divide by zero
+        *hostentryind = -1;
+        return false;
+    }
+    const char *picked_ua = avail_uas[(size_t) random() % navail_uas];
+    const char prefix[] = "User-Agent: ";
+    // sizeof(prefix) already counts the terminating NUL
+    char *new_buf = xmalloc(strlen(picked_ua) + sizeof(prefix));
+    strcpy(new_buf, prefix);
+    strcat(new_buf, picked_ua);
+    struct curl_slist *headers = curl_slist_append(NULL, new_buf);
+    // curl_slist_append copies the string, so the scratch buffer is released on
+    // both the success and the failure path
+    xfree(new_buf);
+    if (headers == NULL) {
+        *hostentryind = -1;
+        return false;
+    }
+    hostentry_t new_entry = {
+        .host = host,
+        .headers = headers,
+        .totalfailurecnt = 0,
+        .failurecnt = 0,
+        .visitcnt = 0
+    };
+    dynarr_push(*host_data, new_entry);
+    *hostentryind = host_data->len - 1;
+    return false;
+}
+
+// Returns true when `url` is byte-for-byte equal to one of the strings stored
+// in `visited`, false otherwise. Linear scan.
+bool didvisit(const char *url, charp_dynarr_t *visited) {
+    size_t pos = 0;
+    while (pos < visited->len) {
+        // Same length and same bytes is exactly what strcmp() == 0 means
+        if (strcmp(url, visited->data[pos]) == 0)
+            return true;
+        pos++;
+    }
+    return false;
+}
+
+// Starts fetching `url` unless it has been visited before.
+//
+// On TRANS_OK a new transfer_t has been allocated, stored in *transfer_ret,
+// attached to state->curl_multi_h and appended to the tail of the transfer list.
+// Side effects: `url` is pushed onto state->visited before anything can fail, and
+// `curl_url_h` is overwritten with `url`.
+//
+// Returns:
+//   TRANS_VISITED              url is already in state->visited (nothing allocated)
+//   TRANS_HOST_ERROR_EXCEEDED  the host's failurecnt is above MAX_FAILURE_CNT
+//   TRANS_FATAL                URL parsing, host lookup, handle creation or
+//                              curl_multi_add_handle failed
+//   TRANS_OK                   the transfer is in flight
+// On every non-OK result after allocation the transfer_t is freed again, so
+// *transfer_ret must not be used.
+transfercode_t starttransfer(CURLU *curl_url_h, crawlerstate_t *state, char *url,
+        transfer_t **transfer_ret, bool wasrequested) {
+    char sanitized[100];
+    if (didvisit(url, &state->visited))
+        // DO NOT free the URL. We are going to use it in our JSON later
+        return TRANS_VISITED;
+    dynarr_push(state->visited, url);
+    sanitize2ascii(sanitized, url, sizeof(sanitized));
+    info("crawling \"%s\"", sanitized);
+
+    // Retrieves handle and host data
+    *transfer_ret = xmalloc(sizeof(transfer_t));
+    memset(*transfer_ret, 0, sizeof(transfer_t));
+    transfer_t *transfer = *transfer_ret;
+    char *curl_host = NULL;
+    CURLUcode url_res;
+    // Each assignment is parenthesised on its own so url_res holds the real
+    // CURLUcode (not the 0/1 result of the comparison) when it is logged below
+    if ((url_res = curl_url_set(curl_url_h, CURLUPART_URL, url, 0)) != CURLUE_OK ||
+        (url_res = curl_url_get(curl_url_h, CURLUPART_HOST, &curl_host, CURLU_PUNYCODE)) != CURLUE_OK) {
+        fatal("URL host parsing failed: %s (code %d)", curl_url_strerror(url_res), url_res);
+        curl_free(curl_host);
+        xfree(transfer);
+        return TRANS_FATAL;
+    }
+    int hostentryind;
+    bool wascached = gethostdata(curl_host, useragents, &state->hostentries, &hostentryind);
+    if (hostentryind < 0) {
+        // host is (should be) in punycode so it's fine if we don't sanitize it
+        fatal("Failed to get host entry for host \"%s\"", curl_host);
+        curl_free(curl_host);
+        xfree(transfer);
+        return TRANS_FATAL;
+    }
+    // A freshly created host entry keeps the curl_host pointer; only an
+    // already-cached host makes our copy redundant
+    if (wascached)
+        curl_free(curl_host);
+    hostentry_t *hostentry = &state->hostentries.data[hostentryind];
+    initcbdata(url, &state->modules, &transfer->cbdata);
+    transfer->hostentryind = hostentryind;
+    transfer->url = url;
+    transfer->handle = makehandle(url, hostentry, &transfer->cbdata, wasrequested);
+    if (transfer->handle == NULL) {
+        fatal("makehandle() failed");
+        xfree(transfer->cbdata.writecb_data.base);
+        xfree(transfer);
+        return TRANS_FATAL;
+    }
+    if (hostentry->failurecnt > MAX_FAILURE_CNT) {
+        error("Max failure count (%d) for host exceeded", MAX_FAILURE_CNT);
+        curl_easy_cleanup(transfer->handle);
+        xfree(transfer->cbdata.writecb_data.base);
+        xfree(transfer);
+        return TRANS_HOST_ERROR_EXCEEDED;
+    }
+
+    // Hand the easy handle to the multi handle so the event loop drives it
+    CURLMcode mc;
+    mc = curl_multi_add_handle(state->curl_multi_h, transfer->handle);
+    if (mc) {
+        fatal("curl_multi_add_handle failed: %s (code %d)", curl_multi_strerror(mc), mc);
+        curl_easy_cleanup(transfer->handle);
+        xfree(transfer->cbdata.writecb_data.base);
+        xfree(transfer);
+        return TRANS_FATAL;
+    }
+
+    // Add the transfer to the end of the transfer list
+    if (state->headtransfer == NULL) {
+        state->headtransfer = transfer;
+    }
+    else {
+        transfer_t *last = state->headtransfer;
+        for (; last->next != NULL; last = last->next)
+            ;
+        last->next = transfer;
+    }
+
+    return TRANS_OK;
+}
+
+// Tears down one transfer: detaches its easy handle from `curl_multi_h`, cleans
+// the handle up, releases the downloaded page buffer (if any) and finally frees
+// the transfer_t itself. Does not unlink `trans` from the transfer list.
+void destroytransfer(CURLM *curl_multi_h, transfer_t *trans) {
+    CURL *easy = trans->handle;
+    curl_multi_remove_handle(curl_multi_h, easy);
+    curl_easy_cleanup(easy);
+
+    if (trans->cbdata.writecb_data.base) {
+        xfree(trans->cbdata.writecb_data.base);
+    }
+    xfree(trans);
+}
+
+// TODO: Extract arguments into a struct
+//
+// Handles a completed transfer reported by libcurl in `msg`.
+//
+// The host's visitcnt is always incremented. When the transfer failed, the error
+// is classified, the host's failure counters are updated where appropriate, and
+// one of TRANS_WRITE_ERR / TRANS_ERROR / TRANS_HOST_ERROR / TRANS_FATAL is
+// returned; in that case `links` and `extrajson` are left untouched
+// (uninitialised for the caller). For connection-level failures the URL is pushed
+// back to the front of state->to_crawl and removed from state->visited so it is
+// retried later.
+//
+// On success (TRANS_OK) the host's consecutive failure counter is reset and:
+//   - with a non-NULL `cb` (a module-requested fetch) the callback receives the
+//     page and both `links` and `extrajson` are initialised empty;
+//   - otherwise every module's onpagecomplete hook runs, the links collected in
+//     `links` are offered to append2tocrawl(), the onpagedestroy hooks run in
+//     reverse module order, and `extrajson` receives the "json" extra data
+//     published by the modules (or an empty array when there is none).
+transfercode_t transferfinished(CURLU *curl_url_h, crawlerstate_t *state,
+        CURLMsg *msg, transfer_t *trans, charp_dynarr_t *links, jsonkv_dynarr_t *extrajson,
+        reqcb_t cb, void *cb_userdata) {
+    CURLcode trans_res = msg->data.result;
+    char sanitized[100];
+    hostentry_t *hostentry = &state->hostentries.data[trans->hostentryind];
+    hostentry->visitcnt++;
+    // Check if transfer went OK
+    if (trans_res != CURLE_OK) {
+        sanitize2ascii(sanitized, trans->url, sizeof(sanitized));
+        transfercode_t ret;
+        headercb_data_t *headerdata;
+        switch (trans_res) {
+        case CURLE_WRITE_ERROR:
+            // One of our own callbacks aborted the transfer; work out why
+            headerdata = &trans->cbdata.headercb_data;
+            if (!(headerdata->flags & HEADERCB_VALID_MIME))
+                info("request failed to \"%s\": is not of mime type text/html, not crawling", sanitized);
+            else if (headerdata->status != 200 && headerdata->status > 0)
+                // Argument order must follow the format: URL first, then status
+                info("request failed to \"%s\": returned status code %d", sanitized, headerdata->status);
+            else
+                info("request failed to \"%s\": header parsing error or page too big", sanitized);
+            ret = TRANS_WRITE_ERR;
+            break;
+        case CURLE_REMOTE_ACCESS_DENIED:
+        case CURLE_BAD_CONTENT_ENCODING:
+        case CURLE_PEER_FAILED_VERIFICATION:
+        case CURLE_WEIRD_SERVER_REPLY:
+        case CURLE_BAD_DOWNLOAD_RESUME:
+            // These count against the host, then share the non-fatal handling below
+            hostentry->failurecnt++;
+            hostentry->totalfailurecnt++;
+            // fallthrough
+        case CURLE_RANGE_ERROR:
+        case CURLE_UNSUPPORTED_PROTOCOL:
+        case CURLE_AUTH_ERROR:
+        case CURLE_LOGIN_DENIED:
+        case CURLE_TOO_MANY_REDIRECTS:
+        case CURLE_FILESIZE_EXCEEDED:
+        case CURLE_HTTP2:
+        case CURLE_HTTP3:
+        case CURLE_HTTP2_STREAM:
+        case CURLE_QUIC_CONNECT_ERROR:
+            error("non-fatal error: %s (code %d)", curl_easy_strerror(trans_res), trans_res);
+            ret = TRANS_ERROR;
+            break;
+        case CURLE_SSL_CONNECT_ERROR:
+        case CURLE_COULDNT_RESOLVE_HOST:
+        case CURLE_COULDNT_CONNECT:
+        case CURLE_OPERATION_TIMEDOUT:
+            info("retrying (eventually)...: %s (code %d)", curl_easy_strerror(trans_res), trans_res);
+            // Add to to_crawl and remove from visited so that we retry the transfer (eventually)
+            deque_push_front(state->to_crawl, trans->url);
+            for (size_t i = 0; i < state->visited.len; i++) {
+                // Shallow compare is fine
+                if (state->visited.data[i] == trans->url) {
+                    dynarr_remove(state->visited, i);
+                    break;
+                }
+            }
+            hostentry->failurecnt++;
+            hostentry->totalfailurecnt++;
+            ret = TRANS_HOST_ERROR;
+            break;
+        default:
+            fatal("aborting...: %s (code %d)", curl_easy_strerror(trans_res), trans_res);
+            ret = TRANS_FATAL;
+            break;
+        }
+        return ret;
+    }
+    // A successful transfer clears the host's consecutive failure streak
+    hostentry->failurecnt = 0;
+
+    // Link aggregation and parsing logic
+    writecb_data_t writecb_data = trans->cbdata.writecb_data;
+    char *page = (char*)writecb_data.base;
+    // `begin` is the write cursor, so the distance from `base` is the page size
+    size_t npage = writecb_data.begin - writecb_data.base;
+    dynarr_init(charp_dynarr_t, *links);
+    debug("%ld links in %ld bytes ", links->len, npage);
+    pagecompletedata_t moduledata = {
+            .url = trans->url,
+            .handle = trans->handle,
+            .page = page,
+            .npage = npage,
+            .parsedlinks = links,
+    };
+
+    // Call callback and terminate, if provided
+    if (cb != NULL) {
+        dynarr_init(jsonkv_dynarr_t, *extrajson);
+        cb(cb_userdata, moduledata.url, moduledata.page, moduledata.npage, trans->handle);
+        return TRANS_OK;
+    }
+
+    // Module and JSON stuff
+    dynarr_init(extradata_dynarr_t, moduledata.extradata);
+    int rc;
+    for (size_t i = 0; i < state->modules.len; i++) {
+        if (state->modules.data[i]->module.onpagecomplete != NULL) {
+            rc = state->modules.data[i]->module.onpagecomplete(
+                state->modules.data[i]->module.userdata, &moduledata);
+            if (rc != 0)
+                error("module %s onpagecomplete failed with code %d", state->modules.data[i]->name, rc);
+        }
+    }
+    for (size_t i = 0; i < links->len; i++)
+        append2tocrawl(curl_url_h, links->data[i], state->config->allowedhosts, &state->to_crawl);
+    // Reverse order; the unsigned index wraps to SIZE_MAX after 0, ending the
+    // loop (and skipping it entirely when there are no modules)
+    for (size_t i = state->modules.len - 1; i != SIZE_MAX; i--) {
+        if (state->modules.data[i]->module.onpagedestroy != NULL) {
+            rc = state->modules.data[i]->module.onpagedestroy(
+                state->modules.data[i]->module.userdata, &moduledata);
+            if (rc != 0)
+                error("module %s onpagedestroy failed with code %d", state->modules.data[i]->name, rc);
+        }
+    }
+    jsonkv_dynarr_t *jsoncand =
+        searchextradata(EXTRA_JSON, "json", moduledata.extradata.data, moduledata.extradata.len);
+    dynarr_destroy(moduledata.extradata);
+    // TODO: Make it so we don't initialize an empty array on no extrajson
+    if (jsoncand == NULL)
+        dynarr_init(jsonkv_dynarr_t, *extrajson);
+    else
+        *extrajson = *jsoncand;
+    return TRANS_OK;
+}
+
+// Result of one successfully crawled page, collected by crawler() and turned
+// into JSON by links2json().
+typedef struct {
+    char *url;                  // URL of the crawled page
+    charp_dynarr_t links;       // links reported for that page
+    jsonkv_dynarr_t extrajson;  // extra key/value pairs contributed by modules
+} linkentry_t;
+
+// Growable array of linkentry_t
+dynarr_def(linkentry_t, linkentry_dynarr_t);
+
+// Collects every link of every entry in `links` into a hash set (hashed and
+// compared with hset_charp_hash / hset_charp_cmp) and returns the set's contents
+// as a new dynamic array. The returned array holds the original string pointers,
+// not copies; its order is whatever order hset_iter yields.
+charp_dynarr_t getuniqlinks(linkentry_dynarr_t *links) {
+    charp_hset_t seen = hset_initi(charp_hset_t, hset_charp_hash, hset_charp_cmp);
+
+    // Pass 1: feed all links into the set, which drops duplicates
+    for (size_t entryno = 0; entryno < links->len; entryno++) {
+        charp_dynarr_t *pagelinks = &links->data[entryno].links;
+        for (size_t linkno = 0; linkno < pagelinks->len; linkno++)
+            hset_add(seen, pagelinks->data[linkno]);
+    }
+
+    // Pass 2: copy the surviving pointers out of the set
+    charp_dynarr_t result = dynarr_initi(charp_dynarr_t);
+    void *cursor = NULL;
+    for (;;) {
+        char **slot = hset_iter(seen, cursor);
+        if (slot == NULL)
+            break;
+        dynarr_push(result, *slot);
+    }
+
+    hset_destroy(seen);
+    return result;
+}
+
+// Builds the JSON report for a crawl and returns it as one object with three keys:
+//   "hosts"       object keyed by host name: {"up", "visitcnt", "failurecnt"}
+//   "urlindicies" array holding every string of `uniqlinks`
+//   "links"       object keyed by crawled URL: {"link_indicies", "nlinks", plus
+//                 the entry's extrajson pairs}; link_indicies are positions in
+//                 `uniqlinks`
+// Calls die() if a link of some entry cannot be found in `uniqlinks`.
+// NOTE(review): keys and strings appear to be stored by pointer, so the inputs
+// presumably have to outlive the returned value - confirm against json.c.
+jsonval_t links2json(charp_dynarr_t *uniqlinks, linkentry_dynarr_t *links, host_dynarr_t *hosts) {
+    jsonkv_t kvpair;
+
+    // Adds key/value pair for each host
+    jsonkv_dynarr_t hostmap = dynarr_initi(jsonkv_dynarr_t);
+    for (size_t i = 0; i < hosts->len; i++) {
+        hostentry_t *curhost = &hosts->data[i];
+        jsonkv_dynarr_t entrymap = dynarr_initi(jsonkv_dynarr_t);
+        // A host counts as "up" when it was visited more often than it failed
+        bool isup = curhost->visitcnt > curhost->totalfailurecnt;
+        kvpair = (jsonkv_t){ .key = "up", .val = json_createbool(isup) };
+        dynarr_push(entrymap, kvpair);
+        kvpair = (jsonkv_t){ .key = "visitcnt", .val = json_createint(curhost->visitcnt) };
+        dynarr_push(entrymap, kvpair);
+        kvpair = (jsonkv_t){ .key = "failurecnt", .val = json_createint(curhost->totalfailurecnt) };
+        dynarr_push(entrymap, kvpair);
+        kvpair = (jsonkv_t){ .key = curhost->host, .val = json_createobj(entrymap) };
+        dynarr_push(hostmap, kvpair);
+    }
+
+    // Adds key/value pair for each visited link
+    jsonkv_dynarr_t linkmap = dynarr_initi(jsonkv_dynarr_t);
+    for (size_t i = 0; i < links->len; i++) {
+        linkentry_t *curentry = &links->data[i];
+        jsonkv_dynarr_t entrymap = dynarr_initi(jsonkv_dynarr_t);
+        jsonval_dynarr_t urlindicies = dynarr_initi(jsonval_dynarr_t);
+        for (size_t j = 0; j < curentry->links.len; j++) {
+            char *cururl = curentry->links.data[j];
+            size_t urlind;
+            // Linear search for this link's position in the unique-link table
+            for (urlind = 0; urlind < uniqlinks->len && strcmp(uniqlinks->data[urlind], cururl) != 0; urlind++)
+                ;
+            if (urlind >= uniqlinks->len)
+                // Every link was fed into the set that produced uniqlinks, so a
+                // miss means the hash set lost an element
+                die("You should never see this message unless i fucked the hash set implementation");
+            jsonval_t urlentry = json_createint(urlind);
+            dynarr_push(urlindicies, urlentry);
+        }
+        kvpair = (jsonkv_t){ .key = "link_indicies", .val = json_createarr(urlindicies) };
+        dynarr_push(entrymap, kvpair);
+        kvpair = (jsonkv_t){ .key = "nlinks", .val = json_createint(urlindicies.len) };
+        dynarr_push(entrymap, kvpair);
+        // Module-provided pairs are appended after the built-in ones
+        // (this inner `i` shadows the outer loop index)
+        for (size_t i = 0; i < curentry->extrajson.len; i++)
+            dynarr_push(entrymap, curentry->extrajson.data[i]);
+        kvpair = (jsonkv_t){ .key = curentry->url, .val = json_createobj(entrymap) };
+        dynarr_push(linkmap, kvpair);
+    }
+
+    // Assembles object containing uniqlinks, hostmap, and linkmap
+    jsonkv_dynarr_t parentmap = dynarr_initi(jsonkv_dynarr_t);
+    jsonval_dynarr_t uniqlinks_json = dynarr_initi(jsonval_dynarr_t);
+    for (size_t i = 0; i < uniqlinks->len; i++)
+        dynarr_push(uniqlinks_json, json_createstr(uniqlinks->data[i]));
+    kvpair = (jsonkv_t) { .key = "hosts", .val = json_createobj(hostmap) };
+    dynarr_push(parentmap, kvpair);
+    kvpair = (jsonkv_t) { .key = "urlindicies", .val = json_createarr(uniqlinks_json) };
+    dynarr_push(parentmap, kvpair);
+    kvpair = (jsonkv_t) { .key = "links", .val = json_createobj(linkmap) };
+    dynarr_push(parentmap, kvpair);
+    return json_createobj(parentmap);
+}
+
+// Runs a crawl that starts at `seed` and writes a JSON report to stdout.
+//
+// Sets up a libcurl multi handle driven through epoll, installs fatalsighandler
+// for SIGINT, initialises every module in config->enabledmodules and then loops:
+//   1. start module-requested fetches from the global `requestedreqs`,
+//   2. start transfers for queued URLs that pass every module filter (stalled
+//      URLs are re-queued, rejected ones are dropped),
+//   3. let libcurl process timeouts and socket events,
+//   4. hand finished transfers to transferfinished() and record their links.
+// The loop ends when nothing is in flight and nothing is queued, when a signal
+// raised `shoulddie`, or on a fatal error. The cleanup section always runs after
+// the state has been initialised: it emits the JSON, destroys the modules and
+// releases all resources.
+void crawler(const char *seed, const crawlerconfig_t *config) {
+    CURLU *curl_url_h = curl_url();
+    if (curl_url_h == NULL) {
+        fatal("CURL URL failed to initialize");
+        return;
+    }
+
+    if ((signal(SIGINT, fatalsighandler) == SIG_ERR)) {
+        //(signal(SIGSEGV, fatalsighandler) == SIG_ERR) ||
+        //(signal(SIGTERM, fatalsighandler) == SIG_ERR)) {
+        fatal("signal() failed: %s (code %d)", strerror(errno), errno);
+        curl_url_cleanup(curl_url_h);
+        return;
+    }
+
+    crawlerstate_t state = { 0 };
+    state.epollfd = -1; // For cleanup if error
+    deque_init(charp_deque_t, state.to_crawl);
+    dynarr_init(charp_dynarr_t, state.visited);
+    dynarr_init(host_dynarr_t, state.hostentries);
+    dynarr_init(int_dynarr_t, state.sockets);
+    state.config = config;
+
+    // Initialised before the first `goto cleanup` because the cleanup section
+    // reads it unconditionally
+    linkentry_dynarr_t linkentries = dynarr_initi(linkentry_dynarr_t);
+
+    char *seed_buf = xmalloc(strlen(seed) + 1);
+    strcpy(seed_buf, seed);
+    deque_push_back(state.to_crawl, seed_buf);
+
+    struct epoll_event *event_buf = xmalloc(sizeof(struct epoll_event) * EPOLL_BUF_SIZE);
+    state.curl_multi_h = curl_multi_init();
+    if (state.curl_multi_h == NULL) {
+        fatal("CURL Multi failed to initialize");
+        goto cleanup;
+    }
+    CURLMcode mc;
+    if ((mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_SOCKETFUNCTION, socketcb)) ||
+        (mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_SOCKETDATA, &state)) ||
+        (mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_TIMERFUNCTION, timercb)) ||
+        (mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_TIMERDATA, &state.timeout_ms)) ||
+        (mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_MAX_HOST_CONNECTIONS, MAX_CONNECTIONS_PER_HOST)) ||
+        (mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_MAX_TOTAL_CONNECTIONS, MAX_CONNECTIONS))) {
+        fatal("curl_multi_setopt failed: %s (code %d)", curl_multi_strerror(mc), mc);
+        goto cleanup;
+    }
+
+    state.epollfd = epoll_create1(0);
+    if (state.epollfd < 0) {
+        fatal("epoll_create1 failed: %s (code %d)", strerror(errno), errno);
+        goto cleanup;
+    }
+
+    // Reset every module to a clean slate (keeping only its init hook) and
+    // let it fill in its own callbacks
+    state.modules = config->enabledmodules;
+    for (size_t i = 0; i < state.modules.len; i++) {
+        int (*init)(crawlermodule_t *) = state.modules.data[i]->module.init;
+        memset(&state.modules.data[i]->module, 0, sizeof(crawlermodule_t));
+        state.modules.data[i]->module.init = init;
+        state.modules.data[i]->module.init(&state.modules.data[i]->module);
+    }
+
+    size_t ntransfers = 0;
+    int running_handles = 0;
+    for(;;) {
+        if (shoulddie > 0) {
+            fatal("shoulddie = %d", shoulddie);
+            break;
+        }
+
+        // Start fetches that modules asked for; an entry with a handle is
+        // already in flight
+        transfer_t *transfer;
+        for (size_t i = 0; i < requestedreqs.len && ntransfers < MAX_CONNECTIONS; i++) {
+            requestedreq_t *request = &requestedreqs.data[i];
+            if (request->handle != NULL)
+                continue;
+            transfercode_t transcode = starttransfer(curl_url_h, &state,
+                    (char*)request->url, &transfer, true);
+            if (transcode == TRANS_FATAL)
+                break;
+            else if (transcode != TRANS_OK)
+                continue;
+            transfer->cbdata.writecb_data.wasrequested = true;
+            request->handle = transfer->handle;
+            ntransfers++;
+        }
+
+        // Start transfers for queued URLs that every module filter lets through
+        charp_dynarr_t stalled = dynarr_initi(charp_dynarr_t);
+        for (size_t i = 0; i < state.to_crawl.len && ntransfers < MAX_CONNECTIONS; i++) {
+            char *crawling = deque_pop_front(state.to_crawl);
+            bool passed = true;
+            for (size_t i = 0; i < state.modules.len; i++) {
+                moduleentry_t *entry = state.modules.data[i];
+                if (entry->module.filter == NULL)
+                    continue;
+                filterres_t res = entry->module.filter(entry->module.userdata, crawling);
+                if (res != FILTER_PASS) {
+                    passed = false;
+                    char sanitized[100];
+                    sanitize2ascii(sanitized, crawling, sizeof(sanitized));
+                    if (res == FILTER_STALL) {
+                        //debug("URL \"%s\" was stalled by module %s", sanitized, entry->name);
+                        dynarr_push(stalled, crawling);
+                        break;
+                    }
+                    else if (res == FILTER_REJECT) {
+                        debug("URL \"%s\" was rejected by module %s", sanitized, entry->name);
+                        // Don't add it to the visited list, in case the filter changes its mind
+                        break;
+                    }
+                }
+            }
+            if (!passed)
+                continue;
+            transfercode_t transcode = starttransfer(curl_url_h, &state, crawling, &transfer, false);
+            if (transcode == TRANS_FATAL)
+                break;
+            else if (transcode != TRANS_OK)
+                continue;
+            ntransfers++;
+        }
+        // Stalled URLs go back to the end of the queue for a later attempt
+        while (stalled.len > 0)
+            deque_push_back(state.to_crawl, dynarr_pop(stalled));
+        // The scratch array is recreated every iteration, so release it here
+        dynarr_destroy(stalled);
+
+        // Kick libcurl so newly added handles start and expired timers fire
+        mc = perf_curl_multi_socket_action(state.curl_multi_h, CURL_SOCKET_TIMEOUT, 0, &running_handles);
+        if (mc) {
+            fatal("curl_multi_socket_action failed: %s (code %d)", curl_multi_strerror(mc), mc);
+            goto cleanup;
+        }
+
+        // Nothing in flight and nothing queued: the crawl is complete
+        if (state.headtransfer == NULL && state.to_crawl.len == 0) {
+            info("No more URLs left to crawl");
+            break;
+        }
+
+        // Only wait for socket activity while libcurl still has work running.
+        // With zero running handles we fall through to the message loop so that
+        // transfers which finished during the call above are still reaped.
+        if (running_handles > 0) {
+            int availfds = perf_epoll_wait(state.epollfd, event_buf, EPOLL_BUF_SIZE, state.timeout_ms);
+            if (availfds < 0) {
+                fatal("epoll_wait failed: %s (code %d)", strerror(errno), errno);
+                break;
+            }
+
+            // Tell CURL about any active connections
+            if (availfds > 0) {
+                for (int i = 0; i < availfds; i++) {
+                    struct epoll_event *connevent = &event_buf[i];
+                    int ev_bitmask = 0;
+                    // TODO: Check for errors on the descriptor
+                    if (connevent->events & EPOLLIN)
+                        ev_bitmask |= CURL_CSELECT_IN;
+                    if (connevent->events & EPOLLOUT)
+                        ev_bitmask |= CURL_CSELECT_OUT;
+                    mc = perf_curl_multi_socket_action(state.curl_multi_h, connevent->data.fd,
+                            ev_bitmask, &running_handles);
+                    if (mc) {
+                        fatal("curl_multi_socket_action failed: %s (code %d)", curl_multi_strerror(mc), mc);
+                        goto cleanup;
+                    }
+                }
+            }
+            else {
+                // epoll timed out: let libcurl handle its own timeouts
+                mc = perf_curl_multi_socket_action(state.curl_multi_h, CURL_SOCKET_TIMEOUT, 0,
+                        &running_handles);
+                if (mc) {
+                    fatal("curl_multi_socket_action failed: %s (code %d)", curl_multi_strerror(mc), mc);
+                    break;
+                }
+            }
+        }
+
+        // Process and prune any finished connections
+        CURLMsg *msg;
+        int nmsgs;
+        while ((msg = curl_multi_info_read(state.curl_multi_h, &nmsgs)) != NULL){
+            // No other message types are currently defined, but in case they are
+            if (msg->msg != CURLMSG_DONE)
+                continue;
+            // Find the transfer owning this easy handle, remembering its predecessor
+            transfer_t *trans, *prev;
+            for (trans = state.headtransfer, prev = NULL;
+                    trans != NULL;
+                    prev = trans, trans = trans->next
+                )
+                if (trans->handle == msg->easy_handle)
+                    break;
+            if (trans == NULL) {
+                fatal("message handle not found, handle=%p, result=%d",
+                        (void*)msg->easy_handle, msg->data.result);
+                goto cleanup;
+            }
+
+            // Check if the request was requested
+            size_t requestind;
+            for (requestind = 0;
+                requestind < requestedreqs.len &&
+                    requestedreqs.data[requestind].handle != trans->handle;
+                requestind++)
+                ;
+            reqcb_t cb = NULL;
+            void *userdata = NULL;
+            if (requestind < requestedreqs.len) {
+                cb = requestedreqs.data[requestind].cb;
+                userdata = requestedreqs.data[requestind].userdata;
+                dynarr_remove(requestedreqs, requestind);
+            }
+
+            // Handle it and log that we visited it
+            charp_dynarr_t links;
+            jsonkv_dynarr_t extrajson;
+            transfercode_t transcode = transferfinished(curl_url_h, &state, msg, trans, &links,
+                    &extrajson, cb, userdata);
+            if (cb == NULL && transcode == TRANS_OK) {
+                linkentry_t entry = { .url = trans->url, .links = links, .extrajson = extrajson };
+                dynarr_push(linkentries, entry);
+            }
+
+            // Unlink it from the transfer list, then release it
+            if (prev == NULL)
+                // Transfer was the head
+                state.headtransfer = trans->next;
+            else
+                // Transfer was not the head
+                prev->next = trans->next;
+            destroytransfer(state.curl_multi_h, trans);
+            if (transcode == TRANS_FATAL)
+                goto cleanup;
+            ntransfers--;
+        }
+    }
+
+    jsonval_t json;
+    charp_dynarr_t uniqlinks;
+cleanup:
+    // Create and write out all of the json
+    uniqlinks = getuniqlinks(&linkentries);
+    json = links2json(&uniqlinks, &linkentries, &state.hostentries);
+    json_write(stdout, &json);
+    // Destroy all modules first bc they might have some save data
+    for (size_t i = 0; i < state.modules.len; i++)
+        if (state.modules.data[i]->module.destroy != NULL)
+            state.modules.data[i]->module.destroy(state.modules.data[i]->module.userdata);
+    // Destroy all links
+    json_destroy(&json);
+    for (size_t i = 0; i < linkentries.len; i++)
+        dynarr_destroy(linkentries.data[i].links);
+    dynarr_destroy(linkentries);
+    for (size_t i = 0; i < uniqlinks.len; i++)
+        xfree(uniqlinks.data[i]);
+    dynarr_destroy(uniqlinks);
+    // Destroy everything else
+    for (size_t i = 0; i < state.hostentries.len; i++) {
+        curl_slist_free_all(state.hostentries.data[i].headers);
+        curl_free((void*)state.hostentries.data[i].host);
+    }
+    for (transfer_t *trans = state.headtransfer, *next; trans != NULL; trans = next) {
+        next = trans->next;
+        destroytransfer(state.curl_multi_h, trans);
+    }
+    if (state.curl_multi_h != NULL)
+        curl_multi_cleanup(state.curl_multi_h);
+    if (state.epollfd >= 0)
+        close(state.epollfd);
+    xfree(event_buf);
+    curl_url_cleanup(curl_url_h);
+    dynarr_destroy(state.visited);
+    dynarr_destroy(state.hostentries);
+    dynarr_destroy(state.sockets);
+    dynarr_destroy(requestedreqs);
+    deque_destroy(state.to_crawl);
+}
diff --git a/src/http.c b/src/http.c
new file mode 100644
index 0000000..5d9344a
--- /dev/null
+++ b/src/http.c
@@ -0,0 +1,152 @@
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <curl/curl.h>
+
+#include "http.h"
+#include "util.h"
+
+#define INIT_PAGE_SIZE 8192
+#define MAX_PAGE_SIZE 1048576
+#define MEM_INC_FACTOR 2
+
+#define TIMEOUT_MS 10000
+#define CONNECT_TIMEOUT_MS 3000
+
+const char *useragents[] = { // NULL-terminated list of user-agent names
+    "AvaBot",
+    NULL,
+};
+
+// libcurl write callback: shows the chunk to every module, then appends it to the NUL-terminated page buffer.
+size_t bufwritecb(const byte *ptr, size_t size, size_t nmemb, writecb_data_t *userdata) {
+    for (size_t i = 0; i < userdata->modules->len; i++) {
+        if (userdata->modules->data[i]->module.onpagewrite != NULL) {
+            int rc = userdata->modules->data[i]->module.onpagewrite(
+                    userdata->modules->data[i]->module.userdata, userdata->url, ptr, nmemb);
+            if (rc != 0) // a failing hook is only reported, the chunk is still stored
+                error("module %s onpagewrite failed with code %d", userdata->modules->data[i]->name, rc);
+        }
+    }
+    size_t len = nmemb;
+    while (userdata->begin + nmemb + 1 > userdata->end) {
+        // Buffer is undersized: grow by MEM_INC_FACTOR, capped at MAX_PAGE_SIZE
+        size_t buf_len = userdata->end - userdata->base;
+        if (buf_len >= MAX_PAGE_SIZE)
+            break;
+        size_t new_buf_len = buf_len * MEM_INC_FACTOR;
+        if (new_buf_len > MAX_PAGE_SIZE)
+            new_buf_len = MAX_PAGE_SIZE;
+        byte *new_base = xrealloc(userdata->base, new_buf_len);
+        userdata->end = new_base + new_buf_len;
+        userdata->begin = new_base + (userdata->begin - userdata->base);
+        userdata->base = new_base;
+    }
+    if (userdata->begin + nmemb + 1 > userdata->end)
+        // Buffer is still undersized: truncate, keeping one byte free for the terminating '\0'
+        len = userdata->end - userdata->begin - 1;
+    memcpy(userdata->begin, ptr, len);
+    userdata->begin[len] = '\0';
+    userdata->begin += len;
+    return len; // a value smaller than nmemb makes libcurl abort the transfer
+}
+
+bool is_redirect(int status) {
+    switch (status) { // Moved Permanently, Found, Temporary Redirect, Permanent Redirect
+    case 301: case 302: case 307: case 308: return true;
+    default: return false;
+    }
+}
+
+// libcurl header callback: tracks the HTTP status; returns != nitems (abort) on a bad status or non-HTML type.
+size_t headerwritecb(const char *buffer, size_t _size, size_t nitems, headercb_data_t *userdata) {
+    const char content_type_str[] = "content-type:", html_mime_str[] = "text/html", http_str[] = "HTTP/";
+    const char *end = buffer + nitems; // header lines are not NUL-terminated, never read at or past end
+    // Parses HTTP status line
+    if (nitems < sizeof(http_str) - 1)
+        return nitems;
+    if (memcmp(buffer, http_str, sizeof(http_str) - 1) == 0) {
+        userdata->num_requests++;
+        const char *status_line = memchr(buffer, ' ', nitems);
+        // Skip the separator, then ensure that status_line..end can fit a status code (3 digits)
+        while (status_line != NULL && status_line < end && *status_line == ' ')
+            status_line++;
+        if (status_line == NULL || end - status_line < 3) {
+            userdata->status = 0;
+            return CURL_WRITEFUNC_ERROR;
+        }
+        char code_str[4] = { status_line[0], status_line[1], status_line[2], '\0' };
+        userdata->status = atoi(code_str);
+        if (userdata->status == 200 || is_redirect(userdata->status))
+            return nitems;
+        return CURL_WRITEFUNC_ERROR;
+    }
+    if (userdata->status == 0 || is_redirect(userdata->status))
+        return nitems;
+    // Parses Content-Type header (only the first one seen for a response is inspected)
+    if (userdata->flags & HEADERCB_CONTENT_TYPE_ENCOUNTERED)
+        return nitems;
+    if (nitems < sizeof(content_type_str) - 1)
+        return nitems;
+    for (size_t i = 0; i < sizeof(content_type_str)-1; i++)
+        if (tolower((unsigned char)buffer[i]) != content_type_str[i])
+            return nitems;
+    // The value starts right after the ':' and may be preceded by optional whitespace
+    const char *header_val = buffer + sizeof(content_type_str) - 1;
+    while (header_val < end && (*header_val == ' ' || *header_val == '\t'))
+        header_val++;
+    userdata->flags |= HEADERCB_CONTENT_TYPE_ENCOUNTERED;
+    if ((size_t)(end - header_val) < sizeof(html_mime_str) - 1 || // too short to hold "text/html"
+        memcmp(header_val, html_mime_str, sizeof(html_mime_str)-1) != 0)
+        return CURL_WRITEFUNC_ERROR;
+    userdata->flags |= HEADERCB_VALID_MIME;
+    return nitems;
+}
+
+void initcbdata(const char *url, moduleentryp_dynarr_t *modules, cbdata_t *data) {
+    byte *buf = xmalloc(INIT_PAGE_SIZE); // fresh, still empty page buffer
+    memset(data, 0, sizeof(*data));
+    data->writecb_data.url = url;
+    data->writecb_data.modules = modules;
+    data->writecb_data.base = data->writecb_data.begin = buf;
+    data->writecb_data.end = buf + INIT_PAGE_SIZE;
+}
+
+// Builds a configured easy handle for url (header filter only when !wasrequested); returns NULL on failure.
+CURL *makehandle(const char *url, hostentry_t *host_entry, cbdata_t *cbdata, bool wasrequested) {
+    CURL *curl_h = curl_easy_init();
+    CURLcode easy_res;
+    if (curl_h == NULL) {
+        error("curl failed to initialize\n");
+        return NULL;
+    }
+    if (/* 1MiB/s max send speed; setopt is variadic: *_LARGE needs curl_off_t, numeric options need long */
+        (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAX_SEND_SPEED_LARGE, (curl_off_t)(1024 * 1024))) != CURLE_OK ||
+        /* 1MiB/s max recv speed */
+        (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAX_RECV_SPEED_LARGE, (curl_off_t)(1024 * 1024))) != CURLE_OK ||
+        (easy_res = curl_easy_setopt(curl_h, CURLOPT_TIMEOUT_MS, (long)TIMEOUT_MS)) != CURLE_OK ||
+        (easy_res = curl_easy_setopt(curl_h, CURLOPT_CONNECTTIMEOUT_MS, (long)CONNECT_TIMEOUT_MS)) != CURLE_OK ||
+        (easy_res = curl_easy_setopt(curl_h, CURLOPT_FOLLOWLOCATION, 1L)) != CURLE_OK ||
+        (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAXREDIRS, 3L)) != CURLE_OK ||
+        (easy_res = curl_easy_setopt(curl_h, CURLOPT_HTTPHEADER, host_entry->headers)) != CURLE_OK ||
+        (easy_res = curl_easy_setopt(curl_h, CURLOPT_PROTOCOLS_STR, "http,https")) != CURLE_OK ||
+        (easy_res = curl_easy_setopt(curl_h, CURLOPT_WRITEFUNCTION, bufwritecb)) != CURLE_OK ||
+        (easy_res = curl_easy_setopt(curl_h, CURLOPT_WRITEDATA, &cbdata->writecb_data)) != CURLE_OK ||
+        (easy_res = curl_easy_setopt(curl_h, CURLOPT_URL, url)) != CURLE_OK) {
+        error("curl setopt failed: %s (code %d)\n", curl_easy_strerror(easy_res), easy_res);
+        curl_easy_cleanup(curl_h);
+        return NULL;
+    }
+    if (!wasrequested && // explicitly requested URLs skip the status/content-type header filter
+        ((easy_res = curl_easy_setopt(curl_h, CURLOPT_HEADERFUNCTION, headerwritecb)) != CURLE_OK ||
+        (easy_res = curl_easy_setopt(curl_h, CURLOPT_HEADERDATA, &cbdata->headercb_data)) != CURLE_OK)) {
+        error("curl setopt failed: %s (code %d)\n", curl_easy_strerror(easy_res), easy_res);
+        curl_easy_cleanup(curl_h);
+        return NULL;
+    }
+    return curl_h;
+}
+
+
diff --git a/src/json.c b/src/json.c
new file mode 100644
index 0000000..f6e78aa
--- /dev/null
+++ b/src/json.c
@@ -0,0 +1,88 @@
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "json.h"
+
+jsonval_t json_createobj(jsonkv_dynarr_t pairs) {
+    jsonval_t obj = { .type = JSON_OBJECT, .data = xmalloc(sizeof(pairs)) }; // heap slot for the dynarr header
+    *(jsonkv_dynarr_t*)obj.data = pairs; // the value takes over the caller's pairs
+    return obj;
+}
+
+jsonval_t json_createarr(jsonval_dynarr_t elems) {
+    jsonval_t arr = { .type = JSON_ARRAY, .data = xmalloc(sizeof(elems)) }; // heap slot for the dynarr header
+    *(jsonval_dynarr_t*)arr.data = elems; // the value takes over the caller's elements
+    return arr;
+}
+
+jsonval_t json_createstr(const char *str) { // stores the pointer itself; the string is not copied
+    return (jsonval_t){ .type = JSON_STRING, .data = (void*)str };
+}
+
+jsonval_t json_createint(long num) { // the integer is stored inside the data pointer, nothing is allocated
+    return (jsonval_t){ .type = JSON_INT, .data = (void*)num };
+}
+
+jsonval_t json_createbool(bool val) { // the flag is stored inside the data pointer, nothing is allocated
+    return (jsonval_t){ .type = JSON_BOOL, .data = (void*)val };
+}
+
+jsonval_t json_createnull(void) { // data is left zero-initialized
+    return (jsonval_t){ .type = JSON_NULL };
+}
+
+void json_destroy(jsonval_t *val) { // recursively frees container storage; other types own nothing
+    if (val->type == JSON_ARRAY) {
+        jsonval_dynarr_t *elems = val->data;
+        for (size_t k = 0; k < elems->len; k++)
+            json_destroy(&elems->data[k]);
+        dynarr_destroy(*elems);
+        xfree(elems);
+    }
+    else if (val->type == JSON_OBJECT) {
+        jsonkv_dynarr_t *pairs = val->data;
+        for (size_t k = 0; k < pairs->len; k++)
+            json_destroy(&pairs->data[k].val);
+        dynarr_destroy(*pairs);
+        xfree(pairs);
+    }
+}
+
+void json_write(FILE *out, jsonval_t *val) { // writes val to out as compact JSON, recursing into containers
+    switch(val->type) {
+    case JSON_OBJECT:
+        fprintf(out, "{");
+        jsonkv_dynarr_t *map = (jsonkv_dynarr_t*)val->data;
+        for (size_t i = 0; i < map->len; i++) {
+            jsonkv_t *pair = &map->data[i];
+            fprintf(out, "\"%s\":", pair->key); // NOTE(review): keys are written verbatim, no JSON escaping
+            json_write(out, &pair->val);
+            if (i != map->len - 1) // separator between pairs, none after the last one
+                fprintf(out, ",");
+        }
+        fprintf(out, "}");
+        break;
+    case JSON_ARRAY:
+        fprintf(out, "[");
+        jsonval_dynarr_t *arr = (jsonval_dynarr_t*)val->data;
+        for (size_t i = 0; i < arr->len; i++) {
+            json_write(out, &arr->data[i]);
+            if (i != arr->len - 1) // separator between elements, none after the last one
+                fprintf(out, ",");
+        }
+        fprintf(out, "]");
+        break;
+    case JSON_STRING:
+        fprintf(out, "\"%s\"", (const char*)val->data); // NOTE(review): written verbatim; callers must pre-escape
+        break;
+    case JSON_INT:
+        fprintf(out, "%ld", (long)val->data); // the integer lives in the pointer value itself
+        break;
+    case JSON_BOOL:
+        fprintf(out, "%s", val->data ? "true" : "false");
+        break;
+    case JSON_NULL:
+        fprintf(out, "null");
+        break;
+    }
+}
diff --git a/src/main.c b/src/main.c
new file mode 100644
index 0000000..47dbf4a
--- /dev/null
+++ b/src/main.c
@@ -0,0 +1,39 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <curl/curl.h>
+#include <sys/random.h>
+
+#include "crawler.h"
+#include "module.h"
+#include "util.h"
+
+const char *allowed_hosts[] = { // NULL-terminated; handed to the crawler as config.allowedhosts
+    "32bit.cafe",
+    //"en.wikipedia.org",
+    NULL
+};
+
+int main(int argc, char **argv) { // expects exactly one argument: the URL handed to crawler()
+    if (argc != 2) {
+        fprintf(stderr, "url dumbass\n");
+        return 1;
+    }
+    int seed, rc; // seed random() from the kernel's entropy source
+    if ((rc = getrandom(&seed, sizeof(seed), 0)) != sizeof(seed)) {
+        fatal("getrandom() failed with %d", rc);
+        return 1;
+    }
+    srandom(seed);
+    if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) { // no other libcurl call is usable if this fails
+        fatal("curl_global_init() failed");
+        return 1;
+    }
+    crawlerconfig_t config = { .allowedhosts = allowed_hosts, .req_interval_s = 0 };
+    dynarr_init(moduleentryp_dynarr_t, config.enabledmodules);
+    for (moduleentry_t *module = availmodules; module->name != NULL; module++) // enable every available module
+        dynarr_push(config.enabledmodules, module);
+    crawler(argv[1], &config);
+    dynarr_destroy(config.enabledmodules);
+    curl_global_cleanup();
+    return 0;
+}
diff --git a/src/mod_debug.c b/src/mod_debug.c
new file mode 100644
index 0000000..5c90180
--- /dev/null
+++ b/src/mod_debug.c
@@ -0,0 +1,56 @@
+#include <tidy.h>
+#include <tidybuffio.h>
+
+#include "util.h"
+#include "module.h"
+
+/* Traverse the document tree */
+void dumpNode(TidyDoc doc, TidyNode tnod, int indent) /* prints the subtree below tnod to stderr */
+{
+  TidyNode child;
+  for(child = tidyGetChild(tnod); child; child = tidyGetNext(child) ) {
+    ctmbstr name = tidyNodeGetName(child);
+    if(name) {
+      /* if it has a name, then it's an HTML tag ... */
+      TidyAttr attr;
+      fprintf(stderr, "%*.*s%s ", indent, indent, "<", name);
+      /* walk the attribute list */
+      for(attr = tidyAttrFirst(child); attr; attr = tidyAttrNext(attr) ) {
+        fprintf(stderr, "%s", tidyAttrName(attr));
+        tidyAttrValue(attr)?fprintf(stderr, "=\"%s\" ",
+                                   tidyAttrValue(attr)):fprintf(stderr, " ");
+      }
+      fprintf(stderr, ">\n");
+    }
+    else {
+      /* if it does not have a name, then it's probably text, cdata, etc... */
+      TidyBuffer buf;
+      tidyBufInit(&buf);
+      tidyNodeGetText(doc, child, &buf);
+      fprintf(stderr, "%*.*s%s\n", indent, indent, "", buf.bp?(char *)buf.bp:"");
+      tidyBufFree(&buf);
+    }
+    dumpNode(doc, child, indent + 4); /* recursive: children are indented 4 columns deeper */
+  }
+}
+ 
+int mod_debug_onpagecomplete(void *userdata, pagecompletedata_t *data) { // dumps raw and tidied page to stderr
+    fprintf(stderr, "\n-- HTML for %s\n\n", data->url);
+    fwrite(data->page, 1, data->npage, stderr); // raw bytes exactly as downloaded
+    fprintf(stderr, "\n\n-- *CLEANED* HTML for %s\n\n", data->url);
+    TidyDoc doc = searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len);
+    if (doc == NULL) { // no "tidyDoc" entry is attached to this page
+        error("\"tidyDoc\" entry not found. either mod_tidy failed or is not loaded before");
+        return -1;
+    }
+    dumpNode(doc, tidyGetRoot(doc), 0);
+    return 0;
+}
+
+int mod_debug_init(crawlermodule_t *entry) { // installs the debug hook; always returns 0
+    *entry = (crawlermodule_t) {
+        .init = entry->init, // carried over; members not listed here are zero-initialized
+        .onpagecomplete = mod_debug_onpagecomplete,
+    };
+    return 0;
+}
diff --git a/src/mod_pagedata.c b/src/mod_pagedata.c
new file mode 100644
index 0000000..b34022b
--- /dev/null
+++ b/src/mod_pagedata.c
@@ -0,0 +1,149 @@
+#include <curl/curl.h>
+#include <tidybuffio.h>
+#include <tidy.h>
+
+#include "json.h"
+#include "module.h"
+#include "util.h"
+
+#define MAX_HEADER_SIZE 256
+#define MAX_TITLE_SIZE 256
+
+// Stores a sanitized heap copy of a response header in *escaped (NULL if absent); false only on real failure.
+bool getescapedheader(CURL *handle, const char *header, char **escaped) {
+    struct curl_header *data;
+    *escaped = NULL;
+    CURLHcode res = curl_easy_header(handle, header, 0, CURLH_HEADER, -1, &data);
+    if (res != CURLHE_OK) {
+        if (res == CURLHE_BADINDEX || res == CURLHE_NOREQUEST || res == CURLHE_NOHEADERS || res == CURLHE_MISSING)
+            return true; // the header simply is not there: success with *escaped == NULL
+        error("curl_easy_header() failed with code %d", res);
+        return false;
+    }
+    int len = strlen(data->value);
+    if (len > MAX_HEADER_SIZE) {
+        error("max header size of %d bytes exceeded. header size is %d bytes", MAX_HEADER_SIZE, len);
+        return false;
+    }
+    *escaped = sanitize2ascii_dyn(data->value, MAX_HEADER_SIZE * 4);
+    if (*escaped == NULL) { // test the result, not the out-parameter itself
+        error("sanitize2ascii_dyn() failed");
+        return false;
+    }
+    return true;
+}
+
+// Depth-first search below node for a <title>; returns a heap copy of its text (caller frees) or NULL.
+char *gettitle(TidyDoc doc, TidyNode node) {
+    for (TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
+        ctmbstr name = tidyNodeGetName(child);
+        if (!name)
+            continue; // nameless node (text, comment, ...): keep looking at the remaining siblings
+        if (strcmp(name, "title") == 0) {
+            TidyNode textchild = tidyGetChild(child); // If conforming, should be text
+            TidyBuffer buf;
+            tidyBufInit(&buf);
+            if (!tidyNodeGetText(doc, textchild, &buf)) {
+                tidyBufFree(&buf);
+                continue;
+            }
+            size_t len = strlen((char*)buf.bp);
+            // tidy places a newline at the end of a title, so we have to be careful to get rid of it
+            if (len <= 1) {
+                tidyBufFree(&buf);
+                continue;
+            }
+            char *ret = xmalloc(len+1);
+            strcpy(ret, (char*)buf.bp);
+            ret[len-1] = '\0';
+            tidyBufFree(&buf);
+            return ret;
+        }
+        char *ret = gettitle(doc, child);
+        if (ret != NULL)
+            return ret;
+    }
+    return NULL;
+}
+
+// Appends the sanitized document title (if any) to json; the copy is also recorded in freearr.
+void dumpnode(TidyDoc doc, charp_dynarr_t *freearr, jsonkv_dynarr_t *json) {
+    TidyNode head = tidyGetHead(doc);
+    char *title = head ? gettitle(doc, head) : NULL;
+    if (title == NULL)
+        return;
+    int len = strlen(title);
+    if (len > MAX_TITLE_SIZE) {
+        error("max title size of %d bytes exceeded. title size is %d bytes", MAX_TITLE_SIZE, len);
+    }
+    else {
+        char *escaped = sanitize2ascii_dyn(title, MAX_TITLE_SIZE * 4);
+        if (escaped == NULL) {
+            error("sanitize2ascii_dyn() failed"); // nothing is emitted for this page's title
+        }
+        else {
+            jsonkv_t kv = { .key = "title", .val = json_createstr(escaped) };
+            dynarr_push(*json, kv);
+            dynarr_push(*freearr, escaped); // the json value only borrows the string
+        }
+    }
+    xfree(title);
+}
+
+int mod_pagedata_onpagecomplete(void *userdata, pagecompletedata_t *data) { // adds header/title pairs to the page json
+    jsonkv_dynarr_t *jsonarr = 
+        searchextradata(EXTRA_JSON, "json", data->extradata.data, data->extradata.len);
+    if (jsonarr == NULL) { // no "json" entry yet: create an empty one and attach it to the page
+        jsonkv_dynarr_t *arr = xmalloc(sizeof(jsonkv_dynarr_t));
+        dynarr_init(jsonkv_dynarr_t, *arr);
+        extradata_t entry = {
+            .type = EXTRA_JSON,
+            .key = "json",
+            .val = arr,
+        };
+        dynarr_push(data->extradata, entry);
+        jsonarr = (jsonkv_dynarr_t*)dynarr_get(data->extradata, data->extradata.len-1)->val;
+    }
+
+    charp_dynarr_t *freearr = userdata; // strings that mod_pagedata_destroy frees later
+    char *headernames[] = {"Content-Type", "Last-Modified", "ETag"};
+    for (size_t i = 0; i < array_size(headernames); i++) {
+        char *escaped;
+        if (!getescapedheader(data->handle, headernames[i], &escaped)) {
+            error("appendheader() failed with header %s", headernames[i]);
+            continue;
+        }
+        if (escaped == NULL) // header not present in the response
+            continue;
+        dynarr_push(*freearr, escaped);
+        jsonkv_t kv = { .key = headernames[i], .val = json_createstr(escaped) };
+        dynarr_push(*jsonarr, kv);
+    }
+
+    TidyDoc doc = searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len);
+    if (doc == NULL)
+        debug("url %s contained no tidy data, skipping document parsing...", data->url);
+    else
+        dumpnode(doc, freearr, jsonarr);
+    return 0;
+}
+
+int mod_pagedata_destroy(void *userdata) {
+    charp_dynarr_t *owned = userdata; // strings recorded by the page-complete hook
+    for (size_t left = owned->len; left-- > 0;)
+        xfree(owned->data[left]);
+    dynarr_destroy(*owned);
+    xfree(owned);
+    return 0;
+}
+
+int mod_pagedata_init(crawlermodule_t *entry) {
+    charp_dynarr_t *owned = xmalloc(sizeof(*owned)); // strings to release in mod_pagedata_destroy
+    *owned = dynarr_initi(charp_dynarr_t);
+    crawlermodule_t fresh = { .init = entry->init }; // keep the init hook, clear every other member
+    fresh.userdata = owned;
+    fresh.destroy = mod_pagedata_destroy;
+    fresh.onpagecomplete = mod_pagedata_onpagecomplete;
+    *entry = fresh;
+    return 0;
+}
diff --git a/src/mod_parse.c b/src/mod_parse.c
new file mode 100644
index 0000000..9db17c7
--- /dev/null
+++ b/src/mod_parse.c
@@ -0,0 +1,214 @@
+#include <curl/curl.h>
+#include <ctype.h>
+#include <tidybuffio.h>
+#include <tidy.h>
+
+#include "module.h"
+#include "util.h"
+
+#define MAX_LINK_LEN 512
+
+bool islinksafe(char c) { // whitelist of the bytes accepted inside an href value
+    if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9'))
+        return true;
+    switch (c) {
+    case '&': case '$': case ',': case '/': case ':': case ';': case '=': case '?': case '@':
+    case '#': case '%': case '~': case '_': case '-': case '(': case ')': case '.':
+        return true;
+    }
+    return false;
+}
+
+// Resolves relative against parent with curl_url_h; returns a heap (xmalloc) copy or NULL on error.
+char *relative2absolute(CURLU *curl_url_h, const char *parent, const char *relative) {
+    char *curl_abs_link = NULL;
+    CURLUcode url_res = curl_url_set(curl_url_h, CURLUPART_URL, parent, 0);
+    if (url_res != CURLUE_OK) {
+        char sanitized[100];
+        sanitize2ascii(sanitized, parent, sizeof(sanitized)); // log the URL that actually failed
+        error("Parent URL parsing failed for \"%s\": %s", 
+                sanitized, curl_url_strerror(url_res));
+        return NULL;
+    }
+    url_res = curl_url_set(curl_url_h, CURLUPART_URL, relative, 0); // applied on top of the parent URL
+    if (url_res != CURLUE_OK){
+        char sanitized[100];
+        sanitize2ascii(sanitized, relative, sizeof(sanitized));
+        error("Relative URL parsing failed for \"%s\": %s", 
+                sanitized, curl_url_strerror(url_res));
+        return NULL;
+    } 
+    url_res = curl_url_get(curl_url_h, CURLUPART_URL, &curl_abs_link, CURLU_PUNYCODE);
+    if (url_res != CURLUE_OK) {
+        error("Full URL parsing failed: %s", curl_url_strerror(url_res));
+        return NULL;
+    }
+    char *ret = xmalloc(strlen(curl_abs_link) + 1); // re-home the string so callers release it with xfree
+    strcpy(ret, curl_abs_link);
+    curl_free(curl_abs_link);
+    return ret;
+}
+
+size_t geturlcutlen(const char *url, size_t len) {
+    // Length of the leading part of url that precedes its first '#' or '?' (len if there is none)
+    size_t cut = 0;
+    while (cut < len && url[cut] != '#' && url[cut] != '?')
+        cut++;
+    return cut;
+}
+
+char *parselink(CURLU *curl_url_h, const char *parent, const char *child, size_t nchild) {
+    size_t keep = geturlcutlen(child, nchild); // drop any query string / fragment
+    if (keep == 0)
+        return NULL;
+    char *trimmed = xmalloc(keep + 1);
+    memcpy(trimmed, child, keep);
+    trimmed[keep] = '\0';
+    char *resolved = relative2absolute(curl_url_h, parent, trimmed);
+    xfree(trimmed);
+    if (resolved == NULL)
+        return NULL;
+    char printable[100];
+    sanitize2ascii(printable, resolved, sizeof(printable));
+    debug("found link: %s", printable);
+    return resolved;
+}
+
+// Fallback link extractor for pages without a tidied tree: scans the raw bytes for href="..."
+// attributes, resolves each value against url and appends it to ret. Returns the number appended.
+int parsehrefs(CURLU *curl_url_h, const char *url, const char *page, size_t npage, charp_dynarr_t *ret) {
+    const char href[] = "href";
+    const size_t href_len = sizeof(href) - 1;
+    int linkcnt = 0;
+    size_t i = 0;
+    while (i + href_len <= npage) {
+        // Case-insensitive match of the attribute name at position i
+        size_t j = 0;
+        while (j < href_len && tolower((unsigned char)page[i + j]) == href[j])
+            j++;
+        if (j != href_len) {
+            i++;
+            continue;
+        }
+        i += href_len;
+        // Optional whitespace, '=', optional whitespace, then the opening quote
+        while (i < npage && (page[i] == ' ' || page[i] == '\t' || page[i] == '\n'))
+            i++;
+        if (i >= npage || page[i] != '=')
+            continue;
+        i++;
+        while (i < npage && (page[i] == ' ' || page[i] == '\t' || page[i] == '\n'))
+            i++;
+        if (i >= npage || page[i] != '"')
+            continue;
+        size_t start = ++i;
+        // The value may only hold link-safe bytes, and at most MAX_LINK_LEN of them
+        while (i < npage && i - start < MAX_LINK_LEN && islinksafe(page[i]))
+            i++;
+        size_t linklen = i - start;
+        if (i >= npage || page[i] != '"' || linklen == 0)
+            continue; // unterminated, unsafe, over-long or empty value: resume scanning from here
+        char *link = parselink(curl_url_h, url, page + start, linklen);
+        if (link != NULL) {
+            dynarr_push(*ret, link);
+            linkcnt++;
+        }
+        i++; // step over the closing quote
+    }
+    return linkcnt;
+}
+
+char *tolower_s(const char *in) { // returns a lower-cased heap copy of in; the caller frees it with xfree
+    size_t len = strlen(in);
+    char *ret = xmalloc(len+1);
+    for (size_t i = 0; i < len; i++)
+        ret[i] = tolower((unsigned char)in[i]); // tolower() is undefined for negative char values
+    ret[len] = '\0';
+    return ret;
+}
+
+int parseattrs(CURLU *curl_url_h, const char *url, TidyNode node, charp_dynarr_t *links) { // returns links added
+    ctmbstr name = tidyNodeGetName(node);
+    const char *texttags[] = { "p", "t", "span", "a" }; // only these tags are inspected for href attributes
+    char *namelower = tolower_s(name);
+    bool found = false;
+    for (size_t i = 0; !found && i < array_size(texttags); i++)
+        if (strcmp(namelower, texttags[i]) == 0)
+            found = true;
+    xfree(namelower);
+    if (!found)
+        return 0;
+    // Parse attributes for links
+    int linkcnt = 0;
+    for (TidyAttr attr = tidyAttrFirst(node); attr; attr = tidyAttrNext(attr)) {
+        ctmbstr attrname = tidyAttrName(attr);
+        if (attrname == NULL) {
+            char sanitized[100];
+            sanitize2ascii(sanitized, url, sizeof(sanitized));
+            error("empty attr name for url \"%s\"", sanitized);
+            continue;
+        }
+        char *attrnamelower = tolower_s(attrname);
+        ctmbstr val = tidyAttrValue(attr);
+        if (strcmp(attrnamelower, "href") != 0 || val == NULL) { // not an href, or an href without a value
+            xfree(attrnamelower);
+            continue;
+        }
+        xfree(attrnamelower);
+        // extract the link
+        char *link = parselink(curl_url_h, url, val, strlen(val));
+        if (link == NULL)
+            continue;
+        dynarr_push(*links, link);
+        linkcnt++;
+    }
+    return linkcnt;
+}
+
+int parsenode(CURLU *curl_url_h, const char *url, TidyNode node, charp_dynarr_t *links) {
+    // Depth-first walk below node: harvest the links of every named child, then
+    // descend into that child. Returns the number of links appended to links.
+    int total = 0;
+    TidyNode cur = tidyGetChild(node);
+    while (cur) {
+        // Unnamed nodes are probably text, for now nothing is done with them
+        // TODO: Parse plain text links
+        if (tidyNodeGetName(cur) != NULL)
+            total += parseattrs(curl_url_h, url, cur, links);
+        total += parsenode(curl_url_h, url, cur, links);
+        cur = tidyGetNext(cur);
+    }
+    return total;
+}
+
+// Page-complete hook: collects the page's links into data->parsedlinks. Always returns 0.
+int mod_parse_onpagecomplete(void *userdata, pagecompletedata_t *data) {
+    CURLU *curl_url_h = userdata;
+    TidyDoc doc = searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len);
+    if (doc != NULL) {
+        // A tidied tree is attached: walk the <body> subtree, if the document has one
+        TidyNode body = tidyGetBody(doc);
+        if (body)
+            parsenode(curl_url_h, data->url, body, data->parsedlinks);
+        return 0;
+    }
+    debug("No tidied document found. Parsing hrefs...");
+    parsehrefs(curl_url_h, data->url, data->page, data->npage, data->parsedlinks);
+    return 0;
+}
+
+int mod_parse_destroy(void *userdata) {
+    // userdata is the CURLU handle that mod_parse_init stored for this module
+    curl_url_cleanup((CURLU*)userdata);
+    return 0;
+}
+
+int mod_parse_init(crawlermodule_t *entry) {
+    // Keep the registered init hook; every other member starts out zeroed before the hooks are set
+    crawlermodule_t fresh = { .init = entry->init };
+    fresh.userdata = curl_url();
+    fresh.onpagecomplete = mod_parse_onpagecomplete;
+    fresh.destroy = mod_parse_destroy;
+    *entry = fresh;
+    return 0;
+}
diff --git a/src/mod_robots.c b/src/mod_robots.c
new file mode 100644
index 0000000..d105ad8
--- /dev/null
+++ b/src/mod_robots.c
@@ -0,0 +1,235 @@
+#include <string.h>
+#include <stdlib.h>
+#include <curl/curl.h>
+#include <time.h>
+#include <errno.h>
+
+#include "module.h"
+#include "util.h"
+
+typedef struct hostdata { // robots.txt rules for one host; nodes form a singly linked list
+    char *host; // heap copy of the host name, used as the lookup key
+    time_t crawldelay_ms; // minimum time between two passed URLs of this host, in milliseconds
+    charp_dynarr_t prefixes; // disallowed path prefixes; data == NULL while the robots.txt request is pending
+    struct hostdata *next;
+    struct timespec lastcrawled; // CLOCK_MONOTONIC time at which the delay check last succeeded
+} hostdata_t;
+
+typedef struct { // module state; also used as per-request callback data (rules then points at a single node)
+    hostdata_t *rules; // head of the host list (module state) or the node being filled (callback data)
+    CURLU *curl_url_h; // URL parser handle, shared between the module state and its callbacks
+} state_t;
+
+bool isprefixed(const char *str, const char *prefix) {
+    // strncmp stops at str's terminator, so a str shorter than prefix can never compare equal
+    return strncmp(str, prefix, strlen(prefix)) == 0;
+}
+
+// Parses a NUL-terminated robots.txt body and records, in rules, the Disallow prefixes and the
+// Crawl-delay of every group that applies to us (User-agent "*" or one starting with "AvaBot").
+// page is tokenized IN PLACE and is therefore modified.
+// NOTE(review): directive names are matched case-sensitively ("User-agent:", not "User-Agent:").
+void parse_robots_txt(hostdata_t *rules, char *page) {
+    for (char *ptr = page; *ptr != '\0'; ptr++)
+        if (*ptr == '\t')
+            *ptr = ' ';
+    char *linesave, *line;
+    // Directives that appear before any User-agent line are treated as applying to us
+    bool forus = true;
+    // page is guaranteed to be terminated by a null byte; splitting on "\r\n" also handles CRLF files
+    while ((line = strtok_r(page, "\r\n", &linesave)) != NULL) {
+        page = NULL;
+        char *comment;
+        if ((comment = strchr(line, '#')) != NULL)
+            *comment = '\0';
+        // func and arg point into page, which stays valid for the whole loop, so no copies
+        // (and no frees on the early-continue paths) are needed
+        char *wssave, *func, *arg;
+        if ((func = strtok_r(line, " ", &wssave)) == NULL ||
+            (arg = strtok_r(NULL, " ", &wssave)) == NULL)
+            continue;
+
+        if (strcmp(func, "User-agent:") == 0) {
+            forus = strcmp(arg, "*") == 0 || isprefixed(arg, "AvaBot");
+        }
+        else if (!forus) {
+            // Rule belongs to a group addressed to some other crawler
+            continue;
+        }
+        else if (strcmp(func, "Disallow:") == 0) {
+            if (strcmp(arg, "*") == 0) {
+                dynarr_push(rules->prefixes, "/");
+            }
+            else {
+                char *buf = xmalloc(strlen(arg)+1);
+                strcpy(buf, arg);
+                dynarr_push(rules->prefixes, buf);
+            }
+        }
+        else if (strcmp(func, "Crawl-delay:") == 0) {
+            // The directive is given in (possibly fractional) seconds, the field holds milliseconds
+            rules->crawldelay_ms = (time_t)(1000.0 * atof(arg));
+        }
+    }
+}
+
+
+
+// robots.txt completion callback; cbdata (heap state_t) is passed on to the retry or freed here.
+void robots_txt_cb(void *cbdata, const char *url, char *page, size_t npage, CURL *handle) {
+    state_t *state = cbdata;
+    hostdata_t *rules = state->rules;
+    if (rules->prefixes.data == NULL) // do not initialise twice when this is the retry
+        dynarr_init(charp_dynarr_t, rules->prefixes);
+    rules->crawldelay_ms = 100;
+    char *curl_path = NULL, *ctype = NULL;
+    CURLUcode rc;
+    CURLcode ec;
+    if ((rc = curl_url_set(state->curl_url_h, CURLUPART_URL, url, 0)) != CURLUE_OK ||
+        (rc = curl_url_get(state->curl_url_h, CURLUPART_PATH, &curl_path, 0)) != CURLUE_OK) {
+        error("curl_url operation failed: %s (code %d)", curl_url_strerror(rc), rc);
+        goto done;
+    }
+    if ((ec = curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype)) != CURLE_OK) {
+        error("curl_easy_getinfo failed: %s (code %d)", curl_easy_strerror(ec), ec);
+        goto done;
+    }
+    if (ctype != NULL && isprefixed(ctype, "text/plain") && page != NULL)
+        parse_robots_txt(rules, page);
+    else if (strcmp(curl_path + (curl_path[0] == '/'), "robots.txt") == 0) { // libcurl paths start with '/'
+        char *curl_newurl;
+        const char *newpath = ".well-known/robots.txt";
+        if ((rc = curl_url_set(state->curl_url_h, CURLUPART_PATH, newpath, 0)) != CURLUE_OK ||
+            (rc = curl_url_get(state->curl_url_h, CURLUPART_URL, &curl_newurl, 0)) != CURLUE_OK) {
+            error("curl_url operation failed: %s (code %d)", curl_url_strerror(rc), rc);
+            goto done;
+        }
+        char *newurl = xmalloc(strlen(curl_newurl)+1);
+        strcpy(newurl, curl_newurl);
+        curl_free(curl_newurl);
+        curl_free(curl_path);
+        makerequest(newurl, robots_txt_cb, cbdata); // the retry takes over cbdata
+        return;
+    }
+done:
+    curl_free(curl_path);
+    xfree(state); // state is cb owned
+}
+
+// Filter hook: decides whether url may be fetched now. The first URL seen for a host triggers a
+// robots.txt request and stalls; afterwards the host's crawl delay and Disallow prefixes apply.
+filterres_t mod_robots_filter(void *userdata, const char *url) {
+    state_t *state = userdata;
+    char *curl_host = NULL, *curl_path = NULL; // NULL so the shared cleanup can always free them
+    filterres_t ret = FILTER_PASS; // URLs that cannot be analysed are let through
+    CURLUcode rc;
+    if ((rc = curl_url_set(state->curl_url_h, CURLUPART_URL, url, 0)) != CURLUE_OK ||
+        (rc = curl_url_get(state->curl_url_h, CURLUPART_HOST, &curl_host, 0)) != CURLUE_OK ||
+        (rc = curl_url_get(state->curl_url_h, CURLUPART_PATH, &curl_path, 0)) != CURLUE_OK) {
+        char sanitized[100];
+        sanitize2ascii(sanitized, url, sizeof(sanitized));
+        error("curl_url operation failed for \"%s\": %s (code %d)", sanitized, 
+                curl_url_strerror(rc), rc);
+        goto cleanup;
+    }
+    if (curl_host == NULL) {
+        error("curl_host == NULL");
+        goto cleanup;
+    }
+    hostdata_t *rules = state->rules, *prev = NULL;
+    for (; rules != NULL &&
+            strcmp(rules->host, curl_host) != 0;
+            prev = rules, rules = rules->next)
+        ;
+    if (rules == NULL) {
+        char *curl_url;
+        if ((rc = curl_url_set(state->curl_url_h, CURLUPART_PATH, "robots.txt", 0)) != CURLUE_OK ||
+            (rc = curl_url_get(state->curl_url_h, CURLUPART_URL, &curl_url, 0)) != CURLUE_OK) {
+            char sanitized[100];
+            sanitize2ascii(sanitized, url, sizeof(sanitized));
+            error("curl_url operation failed for \"%s\": %s (code %d)", sanitized, 
+                    curl_url_strerror(rc), rc);
+            goto cleanup;
+        }
+        if (curl_url == NULL) {
+            error("curl_url == NULL");
+            goto cleanup;
+        }
+        char *host = xmalloc(strlen(curl_host)+1);
+        strcpy(host, curl_host);
+        hostdata_t *newrules = xmalloc(sizeof(hostdata_t));
+        *newrules = (hostdata_t) { .host = host, 0 };
+        if (prev == NULL)
+            state->rules = newrules;
+        else
+            prev->next = newrules;
+        char *requrl = xmalloc(strlen(curl_url)+1);
+        strcpy(requrl, curl_url);
+        state_t *cbdata = xmalloc(sizeof(state_t));
+        cbdata->rules = newrules;
+        cbdata->curl_url_h = state->curl_url_h;
+        debug("cbdata = %p", (void*)cbdata);
+        makerequest(requrl, robots_txt_cb, cbdata);
+        curl_free(curl_url);
+        ret = FILTER_STALL;
+    }
+    else {
+        if (rules->prefixes.data == NULL) { 
+            // robots.txt request hasn't finished
+            ret = FILTER_STALL;
+            goto cleanup;
+        }
+        struct timespec now;
+        if (clock_gettime(CLOCK_MONOTONIC, &now) < 0) {
+            char *err = strerror(errno);
+            error("clock_gettime failed: %s (code %d)", err, errno);
+            ret = FILTER_STALL;
+            goto cleanup;
+        }
+        time_t diff_ms = now.tv_sec * 1000 + now.tv_nsec / 1000000 - 
+            rules->lastcrawled.tv_sec * 1000 - rules->lastcrawled.tv_nsec / 1000000;
+        if (diff_ms < rules->crawldelay_ms) {
+            ret = FILTER_STALL;
+            goto cleanup;
+        }
+        rules->lastcrawled = now;
+        ret = FILTER_PASS;
+        for (size_t i = 0; i < rules->prefixes.len; i++) {
+            if (isprefixed(curl_path, rules->prefixes.data[i])) {
+                ret = FILTER_REJECT;
+                break;
+            }
+        }
+    }
+cleanup:
+    curl_free(curl_path);
+    curl_free(curl_host);
+    return ret;
+}
+
+int mod_robots_destroy(void *userdata) { // releases the module state and every per-host rule node
+    state_t *state = userdata;
+    curl_url_cleanup(state->curl_url_h);
+    for (hostdata_t *cur = state->rules, *next; cur != NULL; cur = next) {
+        next = cur->next;
+        dynarr_destroy(cur->prefixes); // NOTE: elements are not freed, "Disallow: *" stores a string literal
+        xfree(cur->host); // heap copy made by mod_robots_filter
+        xfree(cur);
+    }
+    xfree(state);
+    return 0;
+}
+
+int mod_robots_init(crawlermodule_t *entry) {
+    // Module state: an initially empty per-host rule list plus a URL parser handle
+    state_t *modstate = xmalloc(sizeof(*modstate));
+    modstate->rules = NULL;
+    modstate->curl_url_h = curl_url();
+    // Keep the registered init hook; all members not assigned below start out zeroed
+    crawlermodule_t fresh = { .init = entry->init };
+    fresh.userdata = modstate;
+    fresh.filter = mod_robots_filter;
+    fresh.destroy = mod_robots_destroy;
+    // Publish the fully populated entry in one assignment
+    *entry = fresh;
+    return 0;
+}
diff --git a/src/mod_tidy.c b/src/mod_tidy.c
new file mode 100644
index 0000000..8d828b6
--- /dev/null
+++ b/src/mod_tidy.c
@@ -0,0 +1,71 @@
+#include <tidy.h>
+#include <tidybuffio.h>
+#include "util.h"
+#include "module.h"
+
+// Runs the three tidy passes on docbuf in order: parse, clean-and-repair, diagnostics.
+// A negative code from any pass is logged; for the first two passes it is returned immediately.
+// Otherwise the result of tidyRunDiagnostics() is returned.
+int repairdoc(TidyDoc doc, TidyBuffer *docbuf) {
+    int status = tidyParseBuffer(doc, docbuf);
+    if (status < 0) {
+        error("tidyParseBuffer() returned code %d", status);
+        return status;
+    }
+    if ((status = tidyCleanAndRepair(doc)) < 0) {
+        error("tidyCleanAndRepair() returned code %d", status);
+        return status;
+    }
+    if ((status = tidyRunDiagnostics(doc)) < 0)
+        error("tidyRunDiagnostics() returned code %d", status);
+    return status;
+}
+// Repairs data->page with tidy and, on success, stores the TidyDoc in data->extradata under "tidyDoc".
+// Returns 0 on success, -1 if a "tidyDoc" entry already exists, or the negative code from repairdoc().
+int mod_tidy_onpagecomplete(void *userdata, pagecompletedata_t *data) {
+    if (searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len) != NULL)
+        return -1;
+
+    TidyBuffer origbuf = {0}, errbuf = {0};
+    tidyBufAttach(&origbuf, (byte*)data->page, data->npage); // wraps data->page in place; detached again below
+
+    TidyDoc tdoc = tidyCreate();
+    tidyOptSetBool(tdoc, TidyForceOutput, yes);
+    tidyOptSetInt(tdoc, TidyWrapLen, 4096);
+    tidySetErrorBuffer(tdoc, &errbuf);
+
+    int rc = repairdoc(tdoc, &origbuf);
+    if (rc < 0) {
+        tidyRelease(tdoc); // never stored in extradata on this path, so nobody else would release it
+        goto cleanup;
+    }
+    if (errbuf.bp != NULL) // errbuf starts zeroed; bp is still NULL if tidy wrote no diagnostics
+        debug("encountered errors in doc: %s", errbuf.bp);
+    extradata_t entry = {
+        .type = EXTRA_TIDY,
+        .key = "tidyDoc",
+        .val = (void*)tdoc,
+    };
+    dynarr_push(data->extradata, entry);
+cleanup:
+    tidyBufFree(&errbuf);
+    // TODO: Safety is unknown - tdoc still exists while corresponding tidyBuffer is detached
+    tidyBufDetach(&origbuf);
+    return rc >= 0 ? 0 : rc;
+}
+// Page teardown hook: releases the TidyDoc stored under "tidyDoc" in data->extradata, if any. Always returns 0.
+int mod_tidy_onpagedestroy(void *userdata, pagecompletedata_t *data) {
+    TidyDoc stored = searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len);
+    if (stored != NULL)
+        tidyRelease(stored);
+    return 0;
+}
+
+// Fills *entry with mod_tidy's page hooks, keeping the init pointer the entry already carries. Always returns 0.
+int mod_tidy_init(crawlermodule_t *entry) {
+    crawlermodule_t hooks = { .init = entry->init };
+    hooks.onpagecomplete = mod_tidy_onpagecomplete;
+    hooks.onpagedestroy = mod_tidy_onpagedestroy;
+    *entry = hooks; // every member not set above is zero-initialised
+    return 0;
+}
diff --git a/src/module.c b/src/module.c
new file mode 100644
index 0000000..d3ba42c
--- /dev/null
+++ b/src/module.c
@@ -0,0 +1,28 @@
+#include <string.h>
+
+#include "module.h"
+// Table of built-in modules; the trailing all-zero entry is presumably the end-of-list sentinel (confirm with the code that walks it).
+moduleentry_t availmodules[] = {
+    { .name = "mod_tidy", .module = { .init = mod_tidy_init } },
+    // disabled: { .name = "mod_debug", .module = { .init = mod_debug_init } },
+    { .name = "mod_pagedata", .module = { .init = mod_pagedata_init } },
+    { .name = "mod_parse", .module = { .init = mod_parse_init } },
+    { .name = "mod_robots", .module = { .init = mod_robots_init } },
+    { 0 },
+};
+// Requests queued by makerequest(); starts zeroed and is initialised there on first use.
+requestedreq_dyanrr_t requestedreqs = { 0 };
+// Linear scan of data[0..ndata): returns the val of the first entry whose type and key both match, or NULL if none does.
+void *searchextradata(extradata_type_t type, char *key, extradata_t *data, size_t ndata) {
+    for (size_t remaining = ndata; remaining > 0; remaining--, data++)
+        if (data->type == type && strcmp(data->key, key) == 0)
+            return data->val;
+    return NULL;
+}
+// Appends a request for url (with its callback and userdata) to the global requestedreqs array.
+void makerequest(const char *url, reqcb_t cb, void *userdata) {
+    if (requestedreqs.data == NULL) // first call: the array is still in its zeroed start state
+        dynarr_init(requestedreq_dyanrr_t, requestedreqs);
+    requestedreq_t request = { .url = url, .cb = cb, .userdata = userdata, 0 }; // url pointer is stored as-is, not copied here
+    dynarr_push(requestedreqs, request);
+}
diff --git a/src/util.c b/src/util.c
new file mode 100644
index 0000000..55e3d88
--- /dev/null
+++ b/src/util.c
@@ -0,0 +1,228 @@
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <stdint.h>
+
+#include "util.h"
+
+// Returns a pointer into path at the start of its last two '/'-separated components
+// (e.g. "src/util.c" for "/home/u/src/util.c"); path itself when it holds at most one '/'.
+const char *last2path(const char *path) {
+    const char *prev2 = NULL, *prev = NULL, *cur = path;
+    while (cur != NULL) {
+        prev2 = prev;
+        prev = cur;
+        cur = strchr(cur, '/');
+        if (cur != NULL)
+            cur++; // continue after the '/'; a NULL result is never incremented
+    }
+    return prev2 != NULL ? prev2 : path;
+}
+
+// Writes one log record to stderr: a colour-coded level tag, the call site, the printf-style
+// message built from fmt/ap, and finally a colour reset plus newline. The call site is the last
+// two path components of file with the line number, or "(unknown file)" when file is NULL.
+void volog(loglevel_t level, const char *file, int line, const char *fmt, va_list ap) {
+    // Default tag for values outside the enumerators below; cc would otherwise be read
+    // uninitialised. Not using a default: label keeps -Wswitch able to flag unhandled levels.
+    const char *cc = "\x1b[0mUNKNOWN";
+    switch (level) {
+    case LEVEL_DEBUG: cc = "\x1b[0mDEBUG"; break;
+    case LEVEL_INFO:  cc = "\x1b[0mINFO"; break;
+    case LEVEL_WARN:  cc = "\x1b[1;35mWARN"; break;
+    case LEVEL_ERROR: cc = "\x1b[1;31mERROR"; break;
+    case LEVEL_FATAL: cc = "\x1b[38;5;124mFATAL"; break;
+    }
+
+    // prefix: level tag and call site
+    if (file == NULL)
+        fprintf(stderr, "%s (unknown file): ", cc);
+    else
+        fprintf(stderr, "%s (%s:%d): ", cc, last2path(file), line);
+    // message body
+    vfprintf(stderr, fmt, ap);
+    // reset colours and terminate the line
+    fprintf(stderr, "\x1b[0m\n");
+}
+// Variadic front end for volog(): collects the arguments after fmt into a va_list and forwards everything unchanged.
+void olog(loglevel_t level, const char *file, int line, const char *fmt, ...) {
+    va_list ap;
+    va_start(ap, fmt);
+    volog(level, file, line, fmt, ap);
+    va_end(ap);
+}
+// Logs the printf-style message at LEVEL_FATAL and terminates the process with exit(1); does not return.
+void die(const char *fmt, ...) {
+    va_list ap;
+    va_start(ap, fmt);
+    volog(LEVEL_FATAL, NULL, 0, fmt, ap); // NULL file: the record is tagged "(unknown file)"
+    va_end(ap);
+    exit(1);
+}
+// malloc() that never returns NULL: die()s when the allocation fails. A zero-byte request is served as one byte,
+void *xmalloc(size_t size) {
+    void *ret = malloc(size != 0 ? size : 1); // because malloc(0) may legally return NULL without having failed
+    if (ret == NULL)
+        die("xmalloc failed: %s", strerror(errno));
+    return ret;
+}
+// calloc() that never returns NULL: die()s on failure; a request with nmemb or size of 0 is served as one zeroed byte.
+void *xcalloc(size_t nmemb, size_t size) {
+    void *ret = (nmemb != 0 && size != 0) ? calloc(nmemb, size) : calloc(1, 1); // calloc of 0 bytes may return NULL
+    if (ret == NULL)
+        die("xcalloc failed: %s", strerror(errno));
+    return ret;
+}
+
+// realloc() that never returns NULL: die()s on failure. size 0 is treated as 1 so the block is not freed behind the caller's back.
+void *xrealloc(void *ptr, size_t size) {
+    ptr = realloc(ptr, size != 0 ? size : 1); // realloc(p, 0) may free p and return NULL without having failed
+    if (ptr == NULL)
+        die("xrealloc failed: %s", strerror(errno));
+    return ptr;
+}
+
+void xfree(void *ptr) { // release counterpart of the x* allocators above; forwards straight to free()
+    free(ptr);
+}
+// Sums the ndata bytes at data, each read as unsigned char, and returns the total (a plain byte sum).
+size_t parityhash(const void *data, size_t ndata) {
+    size_t total = 0;
+    for (const unsigned char *byte = data; ndata > 0; ndata--, byte++)
+        total += *byte;
+    return total;
+}
+
+// Returns a newly allocated, NUL-terminated copy of inp in which every byte outside 0x20..0x7e
+// becomes the four characters \xXX (two lowercase hex digits) and every backslash or double
+// quote is prefixed with a backslash. All other bytes are copied unchanged.
+// Returns NULL when inp is longer than maxlen or the escaped text does not fit within maxlen
+// (same margins as sanitize2ascii). The caller releases the result with xfree().
+char *sanitize2ascii_dyn(const char *inp, size_t maxlen) {
+    size_t cap = strlen(inp), out_ind = 0;
+    if (cap > maxlen)
+        return NULL;
+    // Room for the terminator even when inp is empty; this also keeps cap non-zero,
+    // so the doubling below always makes progress.
+    cap += 1;
+    char *out = xmalloc(cap);
+    size_t inp_ind;
+    for (inp_ind = 0; inp[inp_ind] != '\0'; inp_ind++) {
+        // unsigned so that bytes >= 0x80 are classified and printed by value, not sign-extended
+        const unsigned char ch = (unsigned char)inp[inp_ind];
+        // output characters produced for this byte: "\xXX", an escaped char, or the char itself
+        const size_t need = (ch < 0x20 || ch > 0x7e) ? 4 : (ch == '\\' || ch == '"') ? 2 : 1;
+        if (out_ind + need + 1 >= maxlen)
+            break;
+        if (out_ind + need + 1 > cap) {
+            // one doubling is not always enough (e.g. a 1-byte input needing 4 output chars)
+            while (out_ind + need + 1 > cap)
+                cap *= 2;
+            out = xrealloc(out, cap);
+        }
+        if (need == 4) {
+            // writes 4 chars plus a NUL that the next write (or the final terminator) overwrites
+            snprintf(out + out_ind, need + 1, "\\x%02x", (unsigned)ch);
+        }
+        else {
+            if (need == 2)
+                out[out_ind] = '\\'; // escape prefix
+            out[out_ind + need - 1] = (char)ch;
+        }
+        out_ind += need;
+    }
+    if (inp[inp_ind] != '\0') {
+        // stopped early: the escaped form would exceed maxlen
+        xfree(out);
+        return NULL;
+    }
+    out[out_ind] = '\0';
+    return out;
+}
+
+// Writes an escaped, NUL-terminated copy of inp into out (capacity outsize) and returns its length.
+// Bytes outside 0x20..0x7e become \xXX (two lowercase hex digits); backslash and double quote get a
+// backslash prefix. Stops before the first byte whose output would not leave spare room in out,
+// so the result may be truncated. With outsize == 0 nothing is written and 0 is returned.
+size_t sanitize2ascii(char *out, const char *inp, size_t outsize) {
+    size_t out_ind = 0;
+    if (outsize == 0)
+        return 0; // not even the terminator fits
+    for (; *inp != '\0'; inp++) {
+        const unsigned char ch = (unsigned char)*inp; // unsigned: bytes >= 0x80 must not sign-extend
+        const size_t need = (ch < 0x20 || ch > 0x7e) ? 4 : (ch == '\\' || ch == '"') ? 2 : 1;
+        if (out_ind + need + 1 >= outsize)
+            break;
+        if (need == 4) {
+            snprintf(out + out_ind, need + 1, "\\x%02x", (unsigned)ch); // its NUL is overwritten later
+        }
+        else {
+            if (need == 2)
+                out[out_ind] = '\\'; // escape prefix
+            out[out_ind + need - 1] = (char)ch;
+        }
+        out_ind += need;
+    }
+    out[out_ind] = '\0';
+    return out_ind;
+}
+
+// This is the sample c implementation of MurmurHash taken from Wikipedia. Credit to the Wikipedia article 
+// of MurmurHash and whoever made the sample implementation on the page. 
+// https://en.wikipedia.org/w/index.php?title=MurmurHash&oldid=1218923262 accessed on 2024-06-02T18+00:00
+// ----- BEGIN WIKIPEDIA SAMPLE CODE -----
+static inline uint32_t murmur_32_scramble(uint32_t k) { // mixes one 32-bit block before it is folded into the hash state
+    k *= 0xcc9e2d51;
+    k = (k << 15) | (k >> 17); // rotate left by 15
+    k *= 0x1b873593;
+    return k;
+}
+uint32_t murmur3_32(const uint8_t* key, size_t len, uint32_t seed) // 32-bit hash of the len bytes at key, starting from seed
+{
+	uint32_t h = seed;
+    uint32_t k;
+    /* Read in groups of 4. */
+    for (size_t i = len >> 2; i; i--) {
+        // Here is a source of differing results across endiannesses.
+        // A swap here has no effects on hash properties though.
+        memcpy(&k, key, sizeof(uint32_t)); // memcpy rather than a pointer cast: key need not be 4-byte aligned
+        key += sizeof(uint32_t);
+        h ^= murmur_32_scramble(k);
+        h = (h << 13) | (h >> 19); // rotate left by 13
+        h = h * 5 + 0xe6546b64;
+    }
+    /* Read the rest. */
+    k = 0;
+    for (size_t i = len & 3; i; i--) { // the 0-3 bytes left after the 4-byte groups, last byte first
+        k <<= 8;
+        k |= key[i - 1];
+    }
+    // A swap is *not* necessary here because the preceding loop already
+    // places the low bytes in the low places according to whatever endianness
+    // we use. Swaps only apply when the memory is copied in a chunk.
+    h ^= murmur_32_scramble(k);
+    /* Finalize. */
+	h ^= len;
+	h ^= h >> 16;
+	h *= 0x85ebca6b;
+	h ^= h >> 13;
+	h *= 0xc2b2ae35;
+	h ^= h >> 16;
+	return h;
+}
+// ----- END WIKIPEDIA SAMPLE CODE -----
+// Equality callback for char* sets: true when the two pointed-to strings compare equal (an equality test, not an ordering).
+bool hset_charp_cmp(char **lhs, char **rhs) {
+    return !strcmp(*lhs, *rhs);
+}
+// Hash callback for char* sets: MurmurHash3 (32-bit, fixed seed) of the string's bytes, widened to size_t.
+size_t hset_charp_hash(char **str) {
+    // murmur3_32 takes const uint8_t *; the explicit cast avoids an incompatible-pointer-type diagnostic
+    return (size_t)murmur3_32((const uint8_t *)*str, strlen(*str), 0x9747b28c);
+}
diff --git a/tests/deque_pop.c b/tests/deque_pop.c
new file mode 100644
index 0000000..6a0d2be
--- /dev/null
+++ b/tests/deque_pop.c
@@ -0,0 +1,45 @@
+#include <stdlib.h>
+
+#include "util.h"
+#include "unit.h"
+
+#define TEST_LEN 32768
+#define SIZE_MAX (size_t)-1
+// Test: fills a deque from both ends, clones it twice, then checks that pop_front and pop_back each drain a clone in the expected order.
+int tests_deque_pop(int argc, char **argv) {
+    int *expected_front = xmalloc(TEST_LEN / 2 * sizeof(int));
+    int *expected_back = xmalloc(TEST_LEN / 2 * sizeof(int));
+    for (size_t i = 0; i < TEST_LEN / 2; i++)
+        expected_front[i] = rand();
+    for (size_t i = 0; i < TEST_LEN / 2; i++)
+        expected_back[i] = rand();
+    int_deque_t deq;
+    deque_init(int_deque_t, deq);
+    for (size_t i = 0; i < TEST_LEN / 2; i++) { // front values go in last-to-first so they read first-to-last from the front
+        if (rand() % 2 == 0) {
+            deque_push_back(deq, expected_back[i]);
+            deque_push_front(deq, expected_front[TEST_LEN / 2 - i - 1]);
+        } 
+        else {
+            deque_push_front(deq, expected_front[TEST_LEN / 2 - i - 1]);
+            deque_push_back(deq, expected_back[i]);
+        }
+    }
+    int_deque_t from_back, from_front;
+    deque_clone(from_back, deq);
+    deque_clone(from_front, deq);
+    deque_destroy(deq);
+    for (size_t i = 0; i < 2 * (TEST_LEN / 2); i++) { // expected order: all of expected_front, then all of expected_back
+        int exp = (i >= TEST_LEN / 2) ? expected_back[i - TEST_LEN / 2] : expected_front[i];
+        chi_assert("from_front value doesnt match", deque_pop_front(from_front) == exp);
+    }
+    for (size_t i = 2 * (TEST_LEN / 2) - 1; i != SIZE_MAX; i--) { // counts down through 0; i wraps to SIZE_MAX after the last pass
+        int exp = (i >= TEST_LEN / 2) ? expected_back[i - TEST_LEN / 2] : expected_front[i];
+        chi_assert("from_back value doesnt match", deque_pop_back(from_back) == exp);
+    }
+    deque_destroy(from_front);
+    deque_destroy(from_back);
+    xfree(expected_back);
+    xfree(expected_front);
+    return 0;
+}
diff --git a/tests/deque_push.c b/tests/deque_push.c
new file mode 100644
index 0000000..70f66ba
--- /dev/null
+++ b/tests/deque_push.c
@@ -0,0 +1,58 @@
+#include "util.h"
+#include "unit.h"
+
+#define SIZE_MAX (size_t)-1
+// Test: checks the deque's internal layout (base, front, back, len, cap) after mixed, front-only and back-only pushes, including growth.
+int tests_deque_push(int argc, char **argv) {
+    int_deque_t deq;
+
+    // push back and front
+    deque_init(int_deque_t, deq);
+    deque_push_back(deq, 10);
+    chi_assert("incorrect back value", deq.base[0] == 10);
+    deque_push_front(deq, 69);
+    chi_assert("incorrect back value", deq.base[0] == 10);
+    chi_assert("incorrect front value", deq.base[deq.cap-1] == 69);
+    for(size_t i = 0; i < DEQUE_INIT_CAP - 2; i++) // fill the remaining slots so len reaches cap
+        deque_push_front(deq, i);
+    chi_assert("length and/or capacity incorrect", deq.len == deq.cap && deq.cap == DEQUE_INIT_CAP);
+    chi_assert("incorrect back value", deq.base[0] == 10);
+    chi_assert("incorrect front value ", deq.base[deq.cap-1] == 69);
+    chi_assert("incorrect back", deq.back == 1);
+    chi_assert("incorrect front", deq.front == 1);
+    for (size_t i = 2; i < deq.cap - 1; i++)
+        chi_assert("incorrect ordering", deq.base[i-1] > deq.base[i]);
+    deque_push_back(deq, 0xee); // one push past capacity: the assertions below expect cap to double
+    chi_assert("length and/or capacity incorrect correct", 
+            deq.len == DEQUE_INIT_CAP + 1 && deq.cap == DEQUE_INIT_CAP * 2);
+    chi_assert("incorrect back value 1", deq.base[deq.len - 1] == 0xee);
+    chi_assert("incorrect back value 2", deq.base[deq.len - 2] == 10);
+    chi_assert("incorrect front value", deq.base[deq.len - 3] == 69);
+    chi_assert("incorrect back", deq.back == DEQUE_INIT_CAP + 1);
+    chi_assert("incorrect front", deq.front == 0);
+    for (size_t i = 1; i < deq.len - 3; i++)
+        chi_assert("incorrect ordering", deq.base[i-1] > deq.base[i]);
+    deque_destroy(deq);
+
+    // push front
+    deque_init(int_deque_t, deq);
+    for (size_t i = 4 * DEQUE_INIT_CAP - 1; i != SIZE_MAX; i--) // counts down through 0; i wraps to SIZE_MAX afterwards
+        deque_push_front(deq, i);
+    chi_assert("incorrect back and/or front", deq.back == deq.front && deq.back == DEQUE_INIT_CAP * 2);
+    chi_assert("incorrect length and/or capcity", deq.len == deq.cap && deq.cap == 4 * DEQUE_INIT_CAP);
+    for (size_t i = 0; i < deq.cap; i++)
+        chi_assert("incorrect value", *deque_get(deq, i) == i);
+    deque_destroy(deq);
+
+    // push back
+    deque_init(int_deque_t, deq);
+    for (size_t i = 0; i < 4 * DEQUE_INIT_CAP; i++)
+        deque_push_back(deq, i);
+    chi_assert("incorrect back and/or front", deq.back == deq.front && deq.back == 0);
+    chi_assert("incorrect length and/or capcity", deq.len == deq.cap && deq.cap == 4 * DEQUE_INIT_CAP);
+    for (size_t i = 0; i < deq.cap; i++)
+        chi_assert("incorrect value", *deque_get(deq, i) == i);
+    deque_destroy(deq);
+
+    return 0;
+}
diff --git a/tests/dynarr.c.old b/tests/dynarr.c.old
new file mode 100644
index 0000000..ed0018e
--- /dev/null
+++ b/tests/dynarr.c.old
@@ -0,0 +1,133 @@
+#include <string.h>
+
+#include "util.h"
+#include "unit.h"
+
+#define TEST_LEN 32768
+// Legacy test: DYNARR_GET(arr, 5) must yield the address arr.data + 5. NOTE(review): arr is not released here.
+void get_test(void) {
+    int_dynarr_t arr = DYNARR_INIT(int_dynarr_t);
+    for (int i = 0; i < 100; i++)
+        DYNARR_PUSH(arr, 0);
+    chi_assert("get(5) == arr.data + 5", DYNARR_GET(arr, 5) == arr.data + 5);
+}
+// Legacy test: bulk extend, element-wise push, and a mix of both must each reproduce the reference array cmp.
+void extensions_test(void) {
+    int *cmp = xmalloc(TEST_LEN * sizeof(int));
+    for (size_t i = 0; i < TEST_LEN; i++)
+        cmp[i] = rand();
+
+    int_dynarr_t extend = DYNARR_INIT(int_dynarr_t);
+    DYNARR_EXTEND_FIXED(extend, cmp, TEST_LEN);
+    chi_assert("extend.data != cmp", memcmp(extend.data, cmp, TEST_LEN * sizeof(int)) == 0);
+    chi_assert("extend.len != TEST_LEN", extend.len == TEST_LEN);
+    DYNARR_DEINIT(extend);
+
+    int_dynarr_t push = DYNARR_INIT(int_dynarr_t);
+    for (size_t i = 0; i < TEST_LEN; i++)
+        DYNARR_PUSH(push, cmp[i]);
+    chi_assert("push.data != cmp", memcmp(push.data, cmp, TEST_LEN * sizeof(int)) == 0);
+    chi_assert("push.len != TEST_LEN", push.len == TEST_LEN);
+    DYNARR_DEINIT(push);
+
+    int_dynarr_t both = DYNARR_INIT(int_dynarr_t);
+    DYNARR_EXTEND_FIXED(both, cmp, TEST_LEN / 2); // first half in bulk, second half pushed one by one
+    for (size_t i = TEST_LEN / 2; i < TEST_LEN; i++)
+        DYNARR_PUSH(both, cmp[i]);
+    chi_assert("both.data != cmp", memcmp(both.data, cmp, TEST_LEN * sizeof(int)) == 0);
+    chi_assert("both.len != TEST_LEN", both.len == TEST_LEN);
+    DYNARR_DEINIT(both);
+
+    xfree(cmp);
+}
+// Legacy test: interleaved inserts must produce 0..TEST_LEN-1 in order; random-position inserts must preserve the element sum.
+void insert_test(void) {
+    size_t_dynarr_t increm = DYNARR_INIT(size_t_dynarr_t);
+
+    for (size_t i = 0; i < TEST_LEN; i += 2) // even values first ...
+        DYNARR_PUSH(increm, i);
+    for (size_t i = 1; i < TEST_LEN; i += 2) // ... then each odd value inserted at its own index
+        DYNARR_INSERT(increm, i, i);
+    chi_assert("arr.len == TEST_LEN", increm.len == TEST_LEN);
+    for (size_t i = 0; i < TEST_LEN; i++)
+        chi_assert("arr[i] == i", *DYNARR_GET(increm, i) == i);
+
+    DYNARR_DEINIT(increm);
+
+    int_dynarr_t randins = DYNARR_INIT(int_dynarr_t);
+
+    long long parity = 0, check = 0;
+    
+    for (size_t i = 0; i < TEST_LEN; i++) {
+        int gen = rand() % 10;
+        parity += gen;
+        DYNARR_INSERT(randins, (rand() % (randins.len + 1)), gen); // any index from 0 to len inclusive
+    }
+    chi_assert("arr.len == TEST_LEN", randins.len == TEST_LEN);
+    for (size_t i = 0; i < TEST_LEN; i++)
+        check += *DYNARR_GET(randins, i);
+    chi_assert("parity == check", parity == check);
+
+    DYNARR_DEINIT(randins);
+}
+// Legacy helper: mirrors random removals from a dynarr in the plain array check (modified in place) and compares both after each step.
+void check_arr(int check[], size_t check_len) {
+    int_dynarr_t dyn = DYNARR_INIT(int_dynarr_t);
+
+    DYNARR_EXTEND_FIXED(dyn, check, check_len);
+    chi_assert("dyn.len == check_len", dyn.len == check_len);
+    chi_assert("dyn.data == check", memcmp(dyn.data, check, check_len * sizeof(int)) == 0);
+
+    for (size_t i = 0; i < dyn.len; i++) { // NOTE(review): i grows while dyn.len shrinks, so only about half the elements get removed
+        size_t ind = rand() % check_len;
+        DYNARR_REMOVE(dyn, ind);
+        memmove(check + ind, check + ind + 1, (check_len - ind - 1) * sizeof(int));
+        check_len -= 1;
+        chi_assert("dyn.len == check_len (modified)", dyn.len == check_len);
+        chi_assert("dyn.data == check (modified)", memcmp(dyn.data, check, check_len * sizeof(int)) == 0);
+    }
+} // NOTE(review): dyn is not released before returning
+// Legacy test: removes every element at random indices while tracking the remaining sum, then runs check_arr on random data.
+void remove_test(void) {
+    int_dynarr_t randdel = DYNARR_INIT(int_dynarr_t);
+
+    long long parity = 0;
+    
+    for (size_t i = 0; i < TEST_LEN; i++) {
+        int gen = rand() % 10;
+        parity += gen;
+        DYNARR_PUSH(randdel, gen);
+    }
+    long long check = parity;
+    for (size_t i = 0; i < TEST_LEN; i++) {
+        size_t ind = rand() % randdel.len;
+        check -= *DYNARR_GET(randdel, ind); // subtract the value about to be removed
+        DYNARR_REMOVE(randdel, ind);
+        chi_assert("randdel.len == TEST_LEN - i - 1", randdel.len == TEST_LEN - i - 1);
+    }
+    chi_assert("check == 0", check == 0);
+
+    int *c = xmalloc(TEST_LEN * sizeof(int));
+    for (int i = 0; i < TEST_LEN; i++)
+        c[i] = rand();
+    check_arr(c, TEST_LEN);
+    xfree(c);
+
+    DYNARR_DEINIT(randdel);
+}
+// Legacy driver: runs the test named by argv[1]; returns 1 unless exactly one argument is given. Unknown names run nothing.
+int main(int argc, char **argv) {
+    if (argc != 2)
+        return 1;
+    if (strcmp(argv[1], "push") == 0) {
+        push_test(); // NOTE(review): push_test is not defined anywhere in this file
+    } else if (strcmp(argv[1], "get") == 0) {
+        get_test();
+    } else if (strcmp(argv[1], "extensions") == 0) {
+        extensions_test();
+    } else if (strcmp(argv[1], "insert") == 0) {
+        insert_test();
+    } else if (strcmp(argv[1], "remove") == 0) {
+        remove_test();
+    }
+}
diff --git a/tests/dynarr_extensions.c b/tests/dynarr_extensions.c
new file mode 100644
index 0000000..55ca3d7
--- /dev/null
+++ b/tests/dynarr_extensions.c
@@ -0,0 +1,34 @@
+#include "util.h"
+#include "unit.h"
+
+#define TEST_LEN 32768
+// Test: bulk extend, element-wise push, and a mix of both must each reproduce the reference array cmp.
+int tests_dynarr_extensions(int argc, char **argv) {
+    int *cmp = xmalloc(TEST_LEN * sizeof(int));
+    for (size_t i = 0; i < TEST_LEN; i++)
+        cmp[i] = rand();
+
+    int_dynarr_t extend = dynarr_initi(int_dynarr_t);
+    dynarr_extend_fixed(extend, cmp, TEST_LEN);
+    chi_assert("extend.data != cmp", memcmp(extend.data, cmp, TEST_LEN * sizeof(int)) == 0);
+    chi_assert("extend.len != TEST_LEN", extend.len == TEST_LEN);
+    dynarr_destroy(extend);
+
+    int_dynarr_t push = dynarr_initi(int_dynarr_t);
+    for (size_t i = 0; i < TEST_LEN; i++)
+        dynarr_push(push, cmp[i]);
+    chi_assert("push.data != cmp", memcmp(push.data, cmp, TEST_LEN * sizeof(int)) == 0);
+    chi_assert("push.len != TEST_LEN", push.len == TEST_LEN);
+    dynarr_destroy(push);
+
+    int_dynarr_t both = dynarr_initi(int_dynarr_t);
+    dynarr_extend_fixed(both, cmp, TEST_LEN / 2); // first half in bulk, second half pushed one by one
+    for (size_t i = TEST_LEN / 2; i < TEST_LEN; i++)
+        dynarr_push(both, cmp[i]);
+    chi_assert("both.data != cmp", memcmp(both.data, cmp, TEST_LEN * sizeof(int)) == 0);
+    chi_assert("both.len != TEST_LEN", both.len == TEST_LEN);
+    dynarr_destroy(both);
+
+    xfree(cmp);
+    return 0;
+}
diff --git a/tests/dynarr_get.c b/tests/dynarr_get.c
new file mode 100644
index 0000000..0895806
--- /dev/null
+++ b/tests/dynarr_get.c
@@ -0,0 +1,13 @@
+#include "util.h"
+#include "unit.h"
+
+#define TEST_LEN 32768
+// Test: dynarr_get(arr, 5) must yield the address arr.data + 5.
+int tests_dynarr_get(int argc, char **argv) {
+    int_dynarr_t arr = dynarr_initi(int_dynarr_t);
+    for (int i = 0; i < 100; i++)
+        dynarr_push(arr, 0);
+    chi_assert("get(5) == arr.data + 5", dynarr_get(arr, 5) == arr.data + 5);
+    dynarr_destroy(arr);
+    return 0;
+}
diff --git a/tests/dynarr_get1_death.c b/tests/dynarr_get1_death.c
new file mode 100644
index 0000000..371f82a
--- /dev/null
+++ b/tests/dynarr_get1_death.c
@@ -0,0 +1,7 @@
+#include "util.h"
+// Death test: dynarr_get at index 0 of an empty array. Presumably expected to abort, per the _death name - confirm in the test setup.
+int tests_dynarr_get1_death(int argc, char **argv) {
+    int_dynarr_t a = dynarr_initi(int_dynarr_t);
+    dynarr_get(a, 0);
+    return 0;
+}
diff --git a/tests/dynarr_get2_death.c b/tests/dynarr_get2_death.c
new file mode 100644
index 0000000..b233c18
--- /dev/null
+++ b/tests/dynarr_get2_death.c
@@ -0,0 +1,9 @@
+#include "util.h"
+// Death test: dynarr_get at index == len, one past the last element. Presumably expected to abort, per the _death name - confirm.
+int tests_dynarr_get2_death(int argc, char **argv) {
+    int_dynarr_t a = dynarr_initi(int_dynarr_t);
+    for (int i = 0; i < 1000; i++)
+        dynarr_push(a, i);
+    dynarr_get(a, a.len);
+    return 0;
+}
diff --git a/tests/dynarr_get3_death.c b/tests/dynarr_get3_death.c
new file mode 100644
index 0000000..5f7ea2f
--- /dev/null
+++ b/tests/dynarr_get3_death.c
@@ -0,0 +1,10 @@
+#include "util.h"
+// Death test: dynarr_get with the int index -1. Presumably expected to abort, per the _death name - confirm.
+int tests_dynarr_get3_death(int argc, char **argv) {
+    int_dynarr_t a = dynarr_initi(int_dynarr_t);
+    for (int i = 0; i < 1000; i++)
+        dynarr_push(a, i);
+    int k = -1;
+    dynarr_get(a, k);
+    return 0;
+}
diff --git a/tests/dynarr_insert.c b/tests/dynarr_insert.c
new file mode 100644
index 0000000..c250169
--- /dev/null
+++ b/tests/dynarr_insert.c
@@ -0,0 +1,35 @@
+#include "util.h"
+#include "unit.h"
+
+#define TEST_LEN 32768
+// Test: interleaved inserts must produce 0..TEST_LEN-1 in order; random-position inserts must preserve the element sum.
+int tests_dynarr_insert(int argc, char **argv) {
+    size_dynarr_t increm = dynarr_initi(size_dynarr_t);
+
+    for (size_t i = 0; i < TEST_LEN; i += 2) // even values first ...
+        dynarr_push(increm, i);
+    for (size_t i = 1; i < TEST_LEN; i += 2) // ... then each odd value inserted at its own index
+        dynarr_insert(increm, i, i);
+    chi_assert("arr.len == TEST_LEN", increm.len == TEST_LEN);
+    for (size_t i = 0; i < TEST_LEN; i++)
+        chi_assert("arr[i] == i", *dynarr_get(increm, i) == i);
+
+    dynarr_destroy(increm);
+
+    int_dynarr_t randins = dynarr_initi(int_dynarr_t);
+
+    long long parity = 0, check = 0;
+    
+    for (size_t i = 0; i < TEST_LEN; i++) {
+        int gen = rand() % 10;
+        parity += gen;
+        dynarr_insert(randins, (rand() % (randins.len + 1)), gen); // any index from 0 to len inclusive
+    }
+    chi_assert("arr.len == TEST_LEN", randins.len == TEST_LEN);
+    for (size_t i = 0; i < TEST_LEN; i++)
+        check += *dynarr_get(randins, i);
+    chi_assert("parity == check", parity == check);
+
+    dynarr_destroy(randins);
+    return 0;
+}
diff --git a/tests/dynarr_remove.c b/tests/dynarr_remove.c
new file mode 100644
index 0000000..74d7164
--- /dev/null
+++ b/tests/dynarr_remove.c
@@ -0,0 +1,50 @@
+#include "util.h"
+#include "unit.h"
+
+#define TEST_LEN 32768
+// Copies check[0..check_len) into a dynarr, then removes random indices until it is empty, mirroring each
+// removal in check (which is modified in place) and comparing length and contents after every step.
+void check_arr(int check[], size_t check_len) {
+    int_dynarr_t dyn = dynarr_initi(int_dynarr_t);
+    dynarr_extend_fixed(dyn, check, check_len);
+    chi_assert("dyn.len == check_len", dyn.len == check_len);
+    chi_assert("dyn.data == check", memcmp(dyn.data, check, check_len * sizeof(int)) == 0);
+    while (dyn.len > 0) { // loop on the live length: an index counter would meet the shrinking length halfway
+        size_t ind = rand() % check_len; // check_len equals dyn.len here, so it is non-zero
+        dynarr_remove(dyn, ind);
+        memmove(check + ind, check + ind + 1, (check_len - ind - 1) * sizeof(int));
+        check_len -= 1;
+        chi_assert("dyn.len == check_len (modified)", dyn.len == check_len);
+        chi_assert("dyn.data == check (modified)", memcmp(dyn.data, check, check_len * sizeof(int)) == 0);
+    }
+    dynarr_destroy(dyn); // release the array's storage before returning
+}
+// Test: removes every element at random indices while tracking the remaining sum, then runs check_arr on random data.
+int tests_dynarr_remove(int argc, char **argv) {
+    int_dynarr_t randdel = dynarr_initi(int_dynarr_t);
+
+    long long parity = 0;
+    
+    for (size_t i = 0; i < TEST_LEN; i++) {
+        int gen = rand() % 10;
+        parity += gen;
+        dynarr_push(randdel, gen);
+    }
+    long long check = parity;
+    for (size_t i = 0; i < TEST_LEN; i++) {
+        size_t ind = rand() % randdel.len;
+        check -= *dynarr_get(randdel, ind); // subtract the value about to be removed
+        dynarr_remove(randdel, ind);
+        chi_assert("randdel.len == TEST_LEN - i - 1", randdel.len == TEST_LEN - i - 1);
+    }
+    chi_assert("check == 0", check == 0);
+
+    int *c = xmalloc(TEST_LEN * sizeof(int));
+    for (int i = 0; i < TEST_LEN; i++)
+        c[i] = rand();
+    check_arr(c, TEST_LEN);
+    xfree(c);
+
+    dynarr_destroy(randdel);
+    return 0;
+}
diff --git a/tests/hset_add.c b/tests/hset_add.c
new file mode 100644
index 0000000..89f1a87
--- /dev/null
+++ b/tests/hset_add.c
@@ -0,0 +1,69 @@
+#include <stdbool.h>
+#include <string.h>
+
+#include "util.h"
+#include "unit.h"
+// Hash callback for the sets in this test: parityhash() over the characters of the string *ptr points to.
+size_t charp_parityhash(char **ptr) {
+    const char *str = *ptr; return parityhash(str, strlen(str));
+}
+// Equality callback for the sets in this test: true when the two pointed-to strings have identical contents.
+bool charp_cmp(char **lhs, char **rhs) {
+    return !strcmp(*lhs, *rhs);
+}
+// Test: hset_add must report false for a new string and true for one already present, including equal strings at other addresses.
+int tests_hset_add(int argc, char **argv) {
+    charp_hset_t hset1;
+    hset_init(charp_hset_t, hset1, charp_parityhash, charp_cmp);
+
+    char *ptr = "hello";
+    chi_assert("\"hello\" should not be apart of the set", !hset_add(hset1, ptr));
+    chi_assert("\"hello\" should be apart of the set", hset_add(hset1, ptr));
+    
+    ptr = "ehllo"; // same characters as "hello", so charp_parityhash gives the same hash; only charp_cmp tells them apart
+    chi_assert("\"ehllo\" should not be apart of the set", !hset_add(hset1, ptr));
+    char *heap = xmalloc(strlen(ptr) + 1);
+    strcpy(heap, ptr);
+    chi_assert("\"ehllo\" should be apart of the set", hset_add(hset1, heap)); // equal contents at a different address
+    xfree(heap);
+
+    hset_destroy(hset1);
+
+    charp_hset_t hset2;
+    hset_init(charp_hset_t, hset2, charp_parityhash, charp_cmp);
+
+#define MAX_STR_SIZE 3
+    ptr = xmalloc(MAX_STR_SIZE + 2); // room for the 4-digit string the last increment produces, plus NUL
+    strcpy(ptr, "0");
+    char buf[100];
+    for(int len = 1; len < MAX_STR_SIZE + 1;) { // add every decimal string of up to MAX_STR_SIZE digits
+        snprintf(buf, sizeof(buf)-1, "\"%s\" shouldn't be apart of the set", ptr);
+        char *tmp = xmalloc(strlen(ptr) + 1); // NOTE(review): tmp is not freed in this function - confirm who owns it
+        strcpy(tmp, ptr);
+        chi_assert(buf, !hset_add(hset2, tmp));
+        int c = 1;
+        for (int i = len - 1, s = 0; i >= 0; i--) // add 1 to the decimal string with ripple carry
+            s = (ptr[i] - '0') + c, c = s > 9, ptr[i] = s % 10 + '0';
+        if (c != 0) { // carry out of the top digit: shift right and prepend '1'
+            memmove(ptr + 1, ptr, (len++) + 1);
+            *ptr = '1';
+        }
+    }
+    strcpy(ptr, "0");
+    for(int len = 1; len < MAX_STR_SIZE + 1;) { // second pass: every string must now be found and re-adding must report true
+        snprintf(buf, sizeof(buf)-1, "\"%s\" should be apart of the set", ptr);
+        chi_assert(buf, hset_find(hset2, ptr));
+        chi_assert(buf, hset_add(hset2, ptr));
+        int c = 1;
+        for (int i = len - 1, s = 0; i >= 0; i--)
+            s = (ptr[i] - '0') + c, c = s > 9, ptr[i] = s % 10 + '0';
+        if (c != 0) {
+            memmove(ptr + 1, ptr, (len++) + 1);
+            *ptr = '1';
+        }
+    }
+    xfree(ptr);
+    hset_destroy(hset2);
+    
+    return 0;
+}
diff --git a/tests/hset_iter.c b/tests/hset_iter.c
new file mode 100644
index 0000000..cea8f0b
--- /dev/null
+++ b/tests/hset_iter.c
@@ -0,0 +1,34 @@
+#include <stdbool.h>
+#include <string.h>
+
+#include "util.h"
+#include "unit.h"
+// Hash callback for int sets: the pointed-to value itself, converted to size_t.
+size_t int_hash(int *ptr) {
+    return (size_t)*ptr;
+}
+// Test: fills hset1 with 0..999, copies it into hset2 by iterating over hset1, then checks every value reached hset2.
+int tests_hset_iter(int argc, char **argv) {
+    int_hset_t hset1, hset2;
+    hset_init(int_hset_t, hset1, int_hash, NULL);
+    hset_init(int_hset_t, hset2, int_hash, NULL);
+
+    for(int i = 0; i < 1000; i++)
+        hset_add(hset1, i);
+    // walk the populated source set; iterating the still-empty hset2 would copy nothing
+    void *saveptr = NULL;
+    int *data;
+    while ((data = hset_iter(hset1, saveptr)) != NULL)
+        hset_add(hset2, *data);
+
+    char msgbuf[100];
+    for(int i = 0; i < 1000; i++) {
+        snprintf(msgbuf, sizeof(msgbuf), "%d not in hset2", i);
+        chi_assert(msgbuf, hset_find(hset2, i));
+    }
+
+    hset_destroy(hset2);
+    hset_destroy(hset1);
+    
+    return 0;
+}
diff --git a/tests/json_write.c b/tests/json_write.c
new file mode 100644
index 0000000..5f8e720
--- /dev/null
+++ b/tests/json_write.c
@@ -0,0 +1,78 @@
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "json.h"
+#include "unit.h"
+
+#define CMP_STR "{\"string\":\"hello\",\"number\":69,\"boolean\":true,\"null\":" \
+    "null,\"null\":null,\"array\":[\"hello\",\"world\",\"it's\",\"max\"," \
+    "\"flow\",\"with\",\"ryhmes\",\"so-so\",0,-100,false,null],\"object\":{\"hello\":" \
+    "\"dipshit\"}}"
+// Test: builds a JSON object covering every value kind, writes it through a pipe with json_write() and compares the bytes with CMP_STR.
+int tests_json_write(int argc, char **argv) {
+    jsonkv_dynarr_t map = dynarr_initi(jsonkv_dynarr_t);
+    jsonkv_t kv;
+    kv = (jsonkv_t){ .key = "string", .val = json_createstr("hello") };
+    dynarr_push(map, kv);
+    kv = (jsonkv_t){ .key = "number", .val = json_createint(69) };
+    dynarr_push(map, kv);
+    kv = (jsonkv_t){ .key = "boolean", .val = json_createbool(true) };
+    dynarr_push(map, kv);
+    kv = (jsonkv_t){ .key = "null", .val = json_createnull() };
+    dynarr_push(map, kv);
+    dynarr_push(map, kv); // pushed twice on purpose: CMP_STR contains the "null" member twice
+    jsonval_dynarr_t subarr = dynarr_initi(jsonval_dynarr_t);
+    dynarr_push(subarr, json_createstr("hello"));
+    dynarr_push(subarr, json_createstr("world"));
+    dynarr_push(subarr, json_createstr("it's"));
+    dynarr_push(subarr, json_createstr("max"));
+    dynarr_push(subarr, json_createstr("flow"));
+    dynarr_push(subarr, json_createstr("with"));
+    dynarr_push(subarr, json_createstr("ryhmes"));
+    dynarr_push(subarr, json_createstr("so-so"));
+    dynarr_push(subarr, json_createint(0));
+    dynarr_push(subarr, json_createint(-100));
+    dynarr_push(subarr, json_createbool(false));
+    dynarr_push(subarr, json_createnull());
+    jsonval_t j_subarr = json_createarr(subarr);
+    kv = (jsonkv_t){ .key = "array", .val = j_subarr };
+    dynarr_push(map, kv);
+    jsonkv_dynarr_t submap = dynarr_initi(jsonkv_dynarr_t);
+    kv = (jsonkv_t){ .key = "hello", .val = json_createstr("dipshit") };
+    dynarr_push(submap, kv);
+    kv = (jsonkv_t){ .key = "object", .val = json_createobj(submap) };
+    dynarr_push(map, kv);
+    jsonval_t j_map = json_createobj(map);
+
+    int fds[2] = { 11, 12 }; // placeholder values; pipe() overwrites both on success
+    if (pipe(fds) < 0) {
+        fprintf(stderr, "pipe() failed: %s\n", strerror(errno));
+        return 1;
+    }
+    FILE *writer = fdopen(fds[1], "w");
+    if (writer == NULL) {
+        fprintf(stderr, "fdopen() failed: %s\n", strerror(errno));
+        return 1;
+    }
+    json_write(writer, &j_map);
+    json_destroy(&j_map);
+
+    char buf[1000];
+    fflush(writer); // push the buffered JSON into the pipe before reading it back
+    int cnt = read(fds[0], buf, sizeof(buf) - 1); // one byte is kept free for the terminator written below
+    if (cnt < 0) {
+        fprintf(stderr, "read() failed: %s\n", strerror(errno));
+        return 1;
+    }
+    buf[cnt] = '\0';
+    printf("cmp: %s\n", CMP_STR);
+    printf("buf: %s\n", buf);
+    chi_assert("test strings do not match", strcmp(buf, CMP_STR) == 0);
+
+    fclose(writer);
+    close(fds[0]);
+
+    return 0;
+}
diff --git a/tests/robots_txt.c b/tests/robots_txt.c
new file mode 100644
index 0000000..e69de29
diff --git a/tests/unit.h b/tests/unit.h
new file mode 100644
index 0000000..0dd525d
--- /dev/null
+++ b/tests/unit.h
@@ -0,0 +1,3 @@
+#include <stdio.h>
+#include <stdlib.h>
+#define chi_assert(message, test) do { if (!(test)) { fprintf(stderr, "ASSERT FAILED (line %d): %s\n", __LINE__, (message)); exit(1); } } while(0) // evaluates test once; on failure prints message with the line number and exits with status 1