initial commit

This commit is contained in:
Ava Pagefault 2024-09-01 10:12:11 -04:00 committed by Ava Pagefault
commit 20c3cd5510
36 changed files with 3323 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
build/
.cache/
result
outputs/

52
CMakeLists.txt Normal file
View File

@ -0,0 +1,52 @@
# Build script for the Spider2 crawler and its CTest-driven unit tests.
#
# target_link_options() needs CMake 3.13 and install(TARGETS) without an
# explicit destination needs 3.14, so 3.14 is the real minimum for this file.
cmake_minimum_required(VERSION 3.14)

set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON CACHE INTERNAL "")

# The *_INIT variables seed the per-configuration flags and are only honoured
# when they are set before project().
set(CMAKE_C_FLAGS_RELEASE_INIT "-Wall -Wextra -Wpedantic -Wno-language-extension-token -Wno-gnu-statement-expression-from-macro-expansion")
set(CMAKE_C_FLAGS_DEBUG_INIT "${CMAKE_C_FLAGS_RELEASE_INIT} -gdwarf-4")

project(Spider2 VERSION 1.0)

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
find_package(CURL REQUIRED)

# Main executable
# CONFIGURE_DEPENDS re-runs the glob at build time so new sources are picked up.
file(GLOB_RECURSE srcFiles CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/*.c")
add_executable(${PROJECT_NAME} ${srcFiles})
# libtidy is linked as a library (not as a raw "-ltidy" link option) so that it
# is placed after the object files on the link line.
target_link_libraries(${PROJECT_NAME} PRIVATE CURL::libcurl tidy)
target_include_directories(${PROJECT_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include")

# Tests
include(CTest)
if(BUILD_TESTING)
  set(testsToRun
    tests/dynarr_extensions.c
    tests/dynarr_get.c
    tests/dynarr_get1_death.c
    tests/dynarr_get2_death.c
    tests/dynarr_get3_death.c
    tests/dynarr_insert.c
    tests/deque_push.c
    tests/deque_pop.c
    tests/json_write.c
    tests/hset_iter.c
    tests/hset_add.c
  )
  # Generates the CommonTests.c driver, which dispatches on the test name
  # passed as its first argument.
  create_test_sourcelist(tests CommonTests.c ${testsToRun})
  add_executable(CommonTests ${tests} src/util.c src/json.c)
  target_link_libraries(CommonTests PRIVATE CURL::libcurl)
  target_include_directories(CommonTests PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include")
  # Append -pg to both option lists below to profile the tests with gprof.
  target_compile_options(CommonTests PRIVATE -gdwarf-4)
  target_link_options(CommonTests PRIVATE -gdwarf-4)

  foreach(testFile IN LISTS testsToRun)
    get_filename_component(testName "${testFile}" NAME_WE)
    add_test(NAME ${testName} COMMAND CommonTests tests/${testName})
    # Tests whose name ends in "_death" are expected to terminate abnormally.
    if(testName MATCHES "_death$")
      set_tests_properties(${testName} PROPERTIES WILL_FAIL ON)
    endif()
  endforeach()
endif()

# Install rules
install(TARGETS Spider2)

12
README.md Normal file
View File

@ -0,0 +1,12 @@
# Shitty http spider in C
lol
## Build Instructions
To build this, you need CMake, a C compiler, and the curl and tidy libraries together with their development header packages.
```
mkdir build
cd build
cmake -S .. -B . -DCMAKE_BUILD_TYPE=Release
cmake --build .
```
After building, the binary should be at `./build/Spider2` relative to the repo root.
To run the tests, run `ctest` in `./build` relative to the repo root.

27
flake.lock Normal file
View File

@ -0,0 +1,27 @@
{
"nodes": {
"nixpkgs": {
"locked": {
"lastModified": 1718350267,
"narHash": "sha256-hrf/m9msEun15Vbs8+IOijFe4Sb58KxG/BnDSL9xgZQ=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "ecbc30d5ed9f75449233b17d4a4cdeab53af793f",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "release-24.05",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"nixpkgs": "nixpkgs"
}
}
},
"root": "root",
"version": 7
}

56
flake.nix Normal file
View File

@ -0,0 +1,56 @@
# Nix flake for Spider2: an overlay that builds the crawler with CMake, the
# per-system packages derived from it, and a development shell.
{
  inputs.nixpkgs.url = "github:NixOS/nixpkgs/release-24.05";

  outputs = { self, nixpkgs }:
    let
      supportedSystems = [ "x86_64-linux" "x86_64-darwin" "aarch64-linux" "aarch64-darwin" ];
      # Builds an attribute set keyed by system name.
      eachSystem = nixpkgs.lib.genAttrs supportedSystems;
      # nixpkgs instantiated per system with this flake's overlay applied.
      pkgsFor = eachSystem (system: import nixpkgs {
        inherit system;
        overlays = [ self.overlay ];
      });
    in
    {
      overlay = final: prev: {
        # The phases are spelled out by hand; the source tree is used in place
        # ($src) instead of being unpacked.
        spider2 = final.stdenv.mkDerivation {
          name = "Spider2";
          src = ./.;
          nativeBuildInputs = [ final.cmake final.pkg-config ];
          buildInputs = [ final.curl final.html-tidy ];
          dontUnpack = true;
          configurePhase = ''
            runHook preConfigure
            cmake -S $src -B . -DCMAKE_BUILD_TYPE=DEBUG
            runHook postConfigure
          '';
          buildPhase = ''
            runHook preBuild
            cmake --build .
            runHook postBuild
          '';
          installPhase = ''
            runHook preInstall
            cmake --install . --prefix $out
            runHook postInstall
          '';
        };

        # Same build with hardening flags disabled and symbols kept.
        spider2WithDebug = final.spider2.overrideAttrs (_final: _previous: {
          hardeningDisable = [ "all" ];
          dontStrip = true;
        });
      };

      packages = eachSystem (system:
        let
          pkgs = pkgsFor.${system};
        in
        {
          default = pkgs.spider2;
          spider2WithDebug = pkgs.spider2WithDebug;
        });

      devShells = eachSystem (system:
        let
          pkgs = pkgsFor.${system};
          debugTools = [ pkgs.clang-tools pkgs.valgrind pkgs.gdb ];
        in
        {
          # The debug package's build environment plus debugging tools.
          default = pkgs.spider2WithDebug.overrideAttrs (_: old: {
            nativeBuildInputs = old.nativeBuildInputs ++ debugTools;
          });
        });
    };
}

14
include/crawler.h Normal file
View File

@ -0,0 +1,14 @@
// Public interface of the crawler: its configuration struct and entry point.
#ifndef __CRAWLER_H_
#define __CRAWLER_H_
#include "module.h"
// Settings handed to crawler().
typedef struct {
// NULL-terminated list of host names that may be crawled; NULL allows every
// host (see is_allowed_host() in crawler.c).
const char **allowedhosts;
// NOTE(review): presumably the minimum delay between requests, in seconds — confirm in crawler.c.
double req_interval_s;
// Pointers to the module entries to use; presumably the enabled subset of availmodules — TODO confirm.
moduleentryp_dynarr_t enabledmodules;
} crawlerconfig_t;
// Runs the crawler starting from the URL `seed` with the given configuration.
void crawler(const char *seed, const crawlerconfig_t *config);
#endif

46
include/http.h Normal file
View File

@ -0,0 +1,46 @@
// HTTP helpers built on libcurl: per-transfer callback state, URL utilities
// and per-host bookkeeping.
#ifndef __HTTP_H_
#define __HTTP_H_
#include <curl/curl.h>
#include <stdbool.h>
#include "util.h"
#include "module.h"
// State filled in while response headers are processed.
// `status` is compared against 200 in crawler.c, i.e. it holds the HTTP status
// code; `flags` is a bitmask of the HEADERCB_* values below.
// NOTE(review): meaning of num_requests is not visible here — confirm in http.c.
typedef struct {
int status, flags, num_requests;
} headercb_data_t;
// State for the body-write callback.
// crawler.c treats `base` as the start of the page and `begin - base` as the
// number of bytes received; `end` presumably marks the end of the buffer — TODO confirm.
typedef struct {
byte *base, *begin, *end;
moduleentryp_dynarr_t *modules;
const char *url;
bool wasrequested;
} writecb_data_t;
// Both callback states of one transfer, bundled together.
typedef struct {
writecb_data_t writecb_data;
headercb_data_t headercb_data;
} cbdata_t;
// Flag bits for headercb_data_t.flags.
#define HEADERCB_VALID_MIME (1 << 0)
#define HEADERCB_CONTENT_TYPE_ENCOUNTERED (1 << 1)
// NOTE(review): presumably performs a GET on `curl` and returns the body via *cnt / *cntlen — confirm in http.c.
headercb_data_t http_get_to_buf(CURLcode *res, CURL *curl, byte **cnt, size_t *cntlen);
// NOTE(review): presumably resolves `relative` against `parent` and returns a newly allocated URL — confirm ownership in http.c.
char *relative2absolute(CURLU *curl_url_h, const char *parent, const char *relative);
// NOTE(review): presumably extracts the href targets found in `page` — confirm in http.c.
charp_dynarr_t parsehrefs(CURLU *curl_url_h, const char *url, const char *page, size_t pagelen);
// NULL-terminated list of User-Agent strings (crawler.c iterates it until NULL).
extern const char *useragents[];
// Per-host bookkeeping: request headers used for the host and visit/failure counters.
typedef struct {
const char *host;
struct curl_slist *headers;
int totalfailurecnt, failurecnt, visitcnt;
} hostentry_t;
dynarr_def(hostentry_t, host_dynarr_t);
// Creates an easy handle for `url`; crawler.c treats a NULL return as failure.
CURL *makehandle(const char *url, hostentry_t *host_entry, cbdata_t *cbdata, bool wasrequested);
// Initializes `data` for a transfer of `url` using the given modules.
void initcbdata(const char *url, moduleentryp_dynarr_t *modules, cbdata_t *data);
#endif

40
include/json.h Normal file
View File

@ -0,0 +1,40 @@
// Minimal JSON value model with constructors, a destructor and a writer.
#ifndef __JSON_H_
#define __JSON_H_
#include <stdio.h>
#include <stdbool.h>
#include "util.h"
// Discriminator for jsonval_t.
typedef enum {
JSON_OBJECT,
JSON_ARRAY,
JSON_STRING,
JSON_INT,
JSON_BOOL,
JSON_NULL,
} jsontype_t;
// A tagged JSON value.
// NOTE(review): how `data` is interpreted for each type is defined in json.c — confirm there.
typedef struct jsonval {
jsontype_t type;
void *data;
} jsonval_t;
// One key/value pair of a JSON object.
typedef struct {
const char *key;
jsonval_t val;
} jsonkv_t;
dynarr_def(jsonval_t, jsonval_dynarr_t);
dynarr_def(jsonkv_t, jsonkv_dynarr_t);
// Constructors, one per JSON type.
// NOTE(review): whether the object/array constructors take ownership of the passed dynarr is not visible here — confirm in json.c.
jsonval_t json_createobj(jsonkv_dynarr_t pairs);
jsonval_t json_createarr(jsonval_dynarr_t elems);
jsonval_t json_createstr(const char *str);
jsonval_t json_createint(long num);
jsonval_t json_createbool(bool val);
jsonval_t json_createnull(void);
// Releases the resources held by `val`.
void json_destroy(jsonval_t *val);
// Serializes `val` to the stream `out`.
void json_write(FILE *out, jsonval_t *val);
#endif

82
include/module.h Normal file
View File

@ -0,0 +1,82 @@
// Crawler module interface: hook signatures, per-page data handed to modules,
// and the table of available modules.
#ifndef __MODULE_H_
#define __MODULE_H_
#include <curl/curl.h>
#include "util.h"
// Verdict returned by a module's `filter` hook.
// NOTE(review): exact semantics of STALL vs REJECT are not visible here — confirm in the crawler.
typedef enum {
FILTER_PASS,
FILTER_STALL,
FILTER_REJECT,
} filterres_t;
// Kind tag for an extradata_t entry.
typedef enum {
EXTRA_JSON,
EXTRA_TIDY,
EXTRA_OTHER,
} extradata_type_t;
// A typed, keyed value attached to a page (looked up with searchextradata()).
typedef struct {
extradata_type_t type;
char *key;
void *val;
} extradata_t;
dynarr_def(extradata_t, extradata_dynarr_t);
// Everything passed to the onpagecomplete / onpagedestroy hooks for one page.
typedef struct {
const char *url;
CURL *handle;
char *page;
size_t npage;
extradata_dynarr_t extradata;
charp_dynarr_t *parsedlinks;
} pagecompletedata_t;
// Completion callback for a request made through makerequest().
typedef void (*reqcb_t)(void *userdata, const char *url, char *page, size_t npage, CURL *handle);
// A pending request together with its completion callback.
typedef struct {
const char *url;
reqcb_t cb;
void *userdata;
CURL *handle;
} requestedreq_t;
// (sic) "dyanrr": the misspelled type name is part of the interface, see requestedreqs below.
dynarr_def(requestedreq_t, requestedreq_dyanrr_t);
// A module: opaque user data plus optional hooks (crawler.c skips hooks that are NULL).
typedef struct crawlermodule {
void *userdata;
// `init` will both initialize the module, and populate all other functions in its entry, if necessary.
int (*init)(struct crawlermodule *entry);
int (*destroy)(void *userdata);
int (*onpagewrite)(void *userdata, const char *url, const byte *data, size_t ndata);
// crawler.c logs a non-zero return from onpagecomplete / onpagedestroy as an error.
int (*onpagecomplete)(void *userdata, pagecompletedata_t *data);
int (*onpagedestroy)(void *userdata, pagecompletedata_t *data);
filterres_t (*filter)(void *userdata, const char *url);
} crawlermodule_t;
dynarr_def(crawlermodule_t, crawlermodule_dynarr_t);
dynarr_def(crawlermodule_t *, crawlermodulep_dynarr_t);
// A module together with its name.
typedef struct {
const char *name;
crawlermodule_t module;
} moduleentry_t;
dynarr_def(moduleentry_t, moduleentry_dynarr_t);
dynarr_def(moduleentry_t *, moduleentryp_dynarr_t);
// Looks up an entry by type and key in data[0..ndata); crawler.c treats a NULL return as "not found".
void *searchextradata(extradata_type_t type, char *key, extradata_t *data, size_t ndata);
// NOTE(review): presumably queues a request for `url` whose result is delivered to `cb` — confirm in the implementation.
void makerequest(const char *url, reqcb_t cb, void *cbdata);
// Init functions of the built-in modules.
int mod_pagedata_init(crawlermodule_t *entry);
int mod_tidy_init(crawlermodule_t *entry);
int mod_debug_init(crawlermodule_t *entry);
int mod_parse_init(crawlermodule_t *entry);
int mod_robots_init(crawlermodule_t *entry);
extern requestedreq_dyanrr_t requestedreqs;
extern moduleentry_t availmodules[];
#endif

358
include/util.h Normal file
View File

@ -0,0 +1,358 @@
#ifndef __UTIL_H_
#define __UTIL_H_
#include <stdarg.h> // va_list, needed by the volog() prototype below
#include <stdio.h>
#include <stddef.h>
#include <stdbool.h>
#include <string.h>
#include <stdint.h>

// Severity levels accepted by the logging functions, in increasing order.
typedef enum {
    LEVEL_DEBUG,
    LEVEL_INFO,
    LEVEL_WARN,
    LEVEL_ERROR,
    LEVEL_FATAL,
} loglevel_t;

// Logging primitives. `file` and `line` identify the call site and `fmt` is a
// printf-style format; volog() takes the arguments as a va_list, olog() as
// varargs.
void volog(loglevel_t level, const char *file, int line, const char *fmt, va_list ap);
void olog(loglevel_t level, const char *file, int line, const char *fmt, ...);
// Reports a printf-style formatted message.
// NOTE(review): presumably terminates the process — confirm in util.c.
void die(const char *fmt, ...);

// Allocation wrappers mirroring malloc/calloc/realloc/free.
// NOTE(review): presumably abort on allocation failure — confirm in util.c.
void *xmalloc(size_t size);
void *xcalloc(size_t nmemb, size_t size);
void *xrealloc(void *ptr, size_t size);
void xfree(void *ptr);

// Writes a sanitized copy of `inp` into `out`, which holds `outsize` bytes.
// NOTE(review): exact replacement rules and return value are defined in util.c.
size_t sanitize2ascii(char *out, const char *inp, size_t outsize);
char *sanitize2ascii_dyn(const char *inp, size_t maxlen);

// wikipedia sample function. read comment in util.c
uint32_t murmur3_32(const uint8_t* key, size_t len, uint32_t seed);

// Comparison and hash callbacks for hash sets of `char *`
// (passed to hset_initi() in crawler.c).
bool hset_charp_cmp(char **lhs, char **rhs);
size_t hset_charp_hash(char **str);
// Type-generic minimum of A and B. Each argument is evaluated exactly once
// (GNU statement expression + __typeof__), so arguments with side effects are safe.
#define min(A, B) \
({ __typeof__ (A) _a = (A); \
__typeof__ (B) _b = (B); \
_a < _b ? _a : _b; })
// Type-generic maximum of A and B; same single-evaluation guarantee as min().
#define max(A, B) \
({ __typeof__ (A) _a = (A); \
__typeof__ (B) _b = (B); \
_a > _b ? _a : _b; })
// Initial capacities (in elements / buckets) of the generic containers below.
#define DYNARR_INIT_CAP 16
#define DEQUE_INIT_CAP 16
#define HSET_INIT_CAP 256

// Logging shorthands that record the call site's file and line.
#define debug(...) olog(LEVEL_DEBUG, __FILE__, __LINE__, __VA_ARGS__)
#define info(...) olog(LEVEL_INFO, __FILE__, __LINE__, __VA_ARGS__)
#define warn(...) olog(LEVEL_WARN, __FILE__, __LINE__, __VA_ARGS__)
#define error(...) olog(LEVEL_ERROR, __FILE__, __LINE__, __VA_ARGS__)
#define fatal(...) olog(LEVEL_FATAL, __FILE__, __LINE__, __VA_ARGS__)

// Number of elements of a true array (not a pointer). The argument is
// parenthesized and no compiler extension is required.
#define array_size(ARR) (sizeof(ARR) / sizeof((ARR)[0]))
// Declares NAME as a dynamic array of T, plus NAME_innertype as an alias of T
// (used by dynarr_initi to size the initial allocation).
#define dynarr_def(T, NAME) typedef DYNARR(T) NAME; typedef T NAME ## _innertype
// Anonymous struct backing a dynamic array: storage, element count, capacity (in elements).
#define DYNARR(T) struct { \
T* data; \
size_t len, cap; \
}
// Expression yielding an empty array of type ARR_T with DYNARR_INIT_CAP slots allocated.
#define dynarr_initi(ARR_T) (ARR_T){ .data = xmalloc(sizeof(ARR_T ## _innertype) * DYNARR_INIT_CAP), .len = 0, .cap = DYNARR_INIT_CAP }
// Initializes the existing variable ARR (of type ARR_T) as an empty array.
#define dynarr_init(ARR_T, ARR) (ARR) = dynarr_initi(ARR_T)
// Frees the storage and sets data to NULL; a second call is a no-op.
// len and cap are left unchanged.
#define dynarr_destroy(ARR) do { \
if ((ARR).data == NULL) \
break; \
xfree((ARR).data); \
(ARR).data = NULL; \
} while(0)
// Appends ELEM to the end of ARR, doubling the capacity when the array is full.
// The allocation size is derived from the array's element type rather than
// from the type of ELEM, so pushing a narrower value (e.g. an int into a
// size_t array) cannot under-allocate. A capacity of 0 grows to
// DYNARR_INIT_CAP instead of staying 0.
// ARR is evaluated several times; ELEM is evaluated once.
#define dynarr_push(ARR, ELEM) do { \
    if ((ARR).len >= (ARR).cap) { \
        (ARR).cap = (ARR).cap == 0 ? DYNARR_INIT_CAP : (ARR).cap * 2; \
        (ARR).data = xrealloc((ARR).data, (ARR).cap * sizeof(*(ARR).data)); \
    } \
    (ARR).data[(ARR).len] = (ELEM); \
    (ARR).len += 1; \
} while(0)
// Bounds-checked access: yields a POINTER to element INDEX, calling die() when
// INDEX >= len.
// NOTE(review): the local is named `index`, so an INDEX expression that itself
// mentions a variable called `index` would refer to the uninitialized local.
#define dynarr_get(ARR, INDEX) ({ \
size_t index = (INDEX); \
if (index >= (ARR).len)\
die("dyn array out of bounds access"); \
&(ARR).data[index]; })
// Removes the last element and yields its value; calls die() on an empty array.
#define dynarr_pop(ARR) ({ \
if ((ARR).len < 1) \
die("dyn array empty array pop"); \
(ARR).data[--(ARR).len]; })
// Inserts ELEM before position INDEX (INDEX == len appends), shifting the tail
// one slot to the right. Calls die() when INDEX > len.
// All sizes are derived from the array's element type rather than from the
// type of ELEM, so a narrower ELEM cannot cause an under-sized realloc or a
// short memmove. A capacity of 0 grows to DYNARR_INIT_CAP.
#define dynarr_insert(ARR, INDEX, ELEM) do { \
    size_t dynarr_insert_at_ = (INDEX); \
    if (dynarr_insert_at_ > (ARR).len) \
        die("dyn array out of bounds insert"); \
    if ((ARR).len >= (ARR).cap) { \
        (ARR).cap = (ARR).cap == 0 ? DYNARR_INIT_CAP : (ARR).cap * 2; \
        (ARR).data = xrealloc((ARR).data, (ARR).cap * sizeof(*(ARR).data)); \
    } \
    memmove((ARR).data + dynarr_insert_at_ + 1, (ARR).data + dynarr_insert_at_, \
            ((ARR).len - dynarr_insert_at_) * sizeof(*(ARR).data)); \
    (ARR).data[dynarr_insert_at_] = (ELEM); \
    (ARR).len += 1; \
} while (0)
// Removes the element at INDEX, shifting the tail one slot to the left.
// Calls die() when INDEX >= len. The capacity is not reduced.
#define dynarr_remove(ARR, INDEX) do { \
size_t index = INDEX;\
if ((index) >= (ARR).len) \
die("dyn array out of bounds remove"); \
memmove((ARR).data + index, (ARR).data + index + 1, ((ARR).len - index - 1) * sizeof(*(ARR).data)); \
(ARR).len -= 1; \
} while (0)
// Appends NMEMB elements from the plain array FIXED_ARR to DYN_ARR. The
// capacity is doubled until the new length fits, with at most one realloc.
// NOTE(review): the doubling loop never terminates if cap is 0 and NMEMB > 0.
#define dynarr_extend_fixed(DYN_ARR, FIXED_ARR, NMEMB) do { \
size_t nmemb = NMEMB; \
size_t new_cap = (DYN_ARR).cap; \
while ((DYN_ARR).len + nmemb > new_cap) \
new_cap *= 2; \
if (new_cap > (DYN_ARR).cap) { \
(DYN_ARR).cap = new_cap; \
(DYN_ARR).data = xrealloc((DYN_ARR).data, (DYN_ARR).cap * sizeof(*(DYN_ARR).data)); \
} \
memcpy((DYN_ARR).data + (DYN_ARR).len, FIXED_ARR, nmemb * sizeof(*(DYN_ARR).data)); \
(DYN_ARR).len += nmemb; \
} while(0)
// Appends every element of the dynamic array RHS to LHS; same growth strategy
// as dynarr_extend_fixed. The copy size uses LHS's element type, so both
// arrays must have the same element size.
#define dynarr_extend_dyn(LHS, RHS) do { \
size_t new_cap = (LHS).cap; \
while ((LHS).len + (RHS).len > new_cap) \
new_cap *= 2; \
if (new_cap > (LHS).cap) { \
(LHS).cap = new_cap; \
(LHS).data = xrealloc((LHS).data, (LHS).cap * sizeof(*(LHS).data)); \
} \
memcpy((LHS).data + (LHS).len, (RHS).data, (RHS).len * sizeof(*(LHS).data)); \
(LHS).len += (RHS).len; \
} while(0)
// Ready-made dynamic array types for common element types.
dynarr_def(size_t, size_dynarr_t);
dynarr_def(int, int_dynarr_t);
dynarr_def(long, long_dynarr_t);
dynarr_def(long long, long_long_dynarr_t);
dynarr_def(char *, charp_dynarr_t);
dynarr_def(char, char_dynarr_t);
dynarr_def(void *, vp_dynarr_t);
// Anonymous struct backing a ring-buffer deque of T: `base` is the storage of
// `cap` slots, `front` indexes the first element, `back` the slot after the
// last one, and `len` is the element count.
// The definition ends at the closing brace: there is no trailing line
// continuation, so the following line can never be spliced into the macro.
#define DEQUE(T) struct { \
    T* base; \
    size_t cap, front, back, len; \
}
// Declares NAME as a deque of T, plus NAME_innertype as an alias of T
// (used by deque_init to size the initial allocation).
#define deque_def(T, NAME) typedef DEQUE(T) NAME; typedef T NAME ## _innertype
//#define deque_init(DEQ_T) { .base = xmalloc(DEQUE_INIT_CAP * sizeof(DEQ_T ## _innertype)), .cap = DEQUE_INIT_CAP, .front = 0, .back = 0, .len = 0 }
// Initializes the existing variable DEQ (of type DEQ_T) as an empty deque with
// DEQUE_INIT_CAP slots allocated.
#define deque_init(DEQ_T, DEQ) do { (DEQ).base = xmalloc(DEQUE_INIT_CAP * sizeof(DEQ_T ## _innertype)), (DEQ).cap = DEQUE_INIT_CAP, (DEQ).front = 0, (DEQ).back = 0, (DEQ).len = 0; } while (0)
// Frees the storage and sets base to NULL; a second call is a no-op.
#define deque_destroy(DEQ) do { \
if ((DEQ).base == NULL)\
break; \
xfree((DEQ).base); \
(DEQ).base = NULL; \
} while (0)
// Reallocates DEQ to NEW_CAP slots. Does nothing when cap >= NEW_CAP: the
// `continue` inside do { } while (0) jumps to the loop condition, which is 0,
// so the macro exits. Elements are copied into the new buffer in logical
// order, so afterwards front == 0 and back == len. When the contents wrap
// around (len > 0 and front >= back) the copy is done in two pieces: the run
// from front to the end of the old buffer, then the run from the start up to back.
// NEW_CAP is evaluated more than once.
#define deque_grow(DEQ, NEW_CAP) do { \
if ((DEQ).cap >= NEW_CAP) \
continue; \
size_t new_cap = NEW_CAP, size = sizeof(typeof(*(DEQ).base)); \
typeof((DEQ).base) new_base = xmalloc(new_cap * size); \
if ((DEQ).len > 0 && (DEQ).front >= (DEQ).back) { \
size_t end_len = (DEQ).cap - (DEQ).front; \
memcpy(new_base, (DEQ).base + (DEQ).front, end_len * size);\
memcpy(new_base + end_len, (DEQ).base, (DEQ).back * size); \
} \
else { \
memcpy(new_base, (DEQ).base + (DEQ).front, (DEQ).len * size); \
} \
xfree((DEQ).base); \
(DEQ).base = new_base, (DEQ).cap = new_cap, (DEQ).front = 0, (DEQ).back = (DEQ).len; \
} while (0)
// Appends ELEM at the back. A non-empty deque with back == front is full, in
// which case the capacity is doubled first.
#define deque_push_back(DEQ, ELEM) do { \
if ((DEQ).len > 0 && (DEQ).back == (DEQ).front) \
deque_grow(DEQ, (DEQ).cap * 2); \
(DEQ).base[(DEQ).back] = (ELEM); \
(DEQ).back = ((DEQ).back + 1) % (DEQ).cap; \
(DEQ).len += 1; \
} while (0)
// Removes the last element and yields its value; calls die() on an empty deque.
// `back` is stepped backwards with wrap-around before the slot is read.
#define deque_pop_back(DEQ) ({ \
if ((DEQ).len == 0) \
die("deque empty pop back"); \
(DEQ).back = (DEQ).back == 0 ? (DEQ).cap - 1 : (DEQ).back - 1; \
(DEQ).len -= 1;\
(DEQ).base[(DEQ).back]; \
})
// Prepends ELEM at the front, doubling the capacity first when the deque is
// full. `front` is stepped backwards with wrap-around before the slot is written.
#define deque_push_front(DEQ, ELEM) do { \
if ((DEQ).len > 0 && (DEQ).back == (DEQ).front) \
deque_grow(DEQ, (DEQ).cap * 2); \
(DEQ).front = (DEQ).front == 0 ? (DEQ).cap - 1 : (DEQ).front - 1; \
(DEQ).len += 1; \
(DEQ).base[(DEQ).front] = (ELEM); \
} while (0)
// Removes the first element and yields its value; calls die() on an empty deque.
#define deque_pop_front(DEQ) ({ \
if ((DEQ).len == 0) \
die("deque empty pop front"); \
size_t old_front = (DEQ).front; \
(DEQ).front = ((DEQ).front + 1) % (DEQ).cap; \
(DEQ).len -= 1;\
(DEQ).base[old_front]; \
})
// Bounds-checked access: yields a POINTER to the INDEX-th element counted from
// the front, calling die() when INDEX >= len.
#define deque_get(DEQ, INDEX) ({ \
size_t index = (INDEX); \
if (index >= (DEQ).len) \
die("deque out of bounds access"); \
&(DEQ).base[((DEQ).front + index) % (DEQ).cap]; \
})
// Makes DST an independent copy of SRC. The bookkeeping fields (cap, front,
// back, len) are copied verbatim, so the new buffer must hold all `cap` slots
// and preserve the slot layout; copying only `len` elements from `base` would
// lose wrapped-around or offset elements and leave `cap` larger than the
// allocation. Any storage previously owned by DST is not freed.
#define deque_clone(DST, SRC) ({ \
    memcpy(&(DST), &(SRC), sizeof(DST)); \
    (DST).base = xmalloc((SRC).cap * sizeof(*(DST).base)); \
    memcpy((DST).base, (SRC).base, (SRC).cap * sizeof(*(DST).base)); \
})
// Ready-made deque types for common element types.
deque_def(int, int_deque_t);
deque_def(size_t, size_deque_t);
deque_def(long, long_deque_t);
deque_def(long long, longlong_deque_t);
deque_def(char, char_deque_t);
deque_def(char *, charp_deque_t);
deque_def(void *, voidp_deque_t);
// NOTE(review): presumably a simple hash over the ndata bytes at `data` — confirm in util.c.
size_t parityhash(const void *data, size_t ndata);
// Chain node of a hash set: the stored value plus a link to the next node in
// the same bucket.
#define HSET_BUCKET(T, NAME) struct NAME ## _struct { \
struct NAME ## _struct *next; \
T data; \
}
// Anonymous struct backing a hash set: an array of `nbuckets` chain heads, an
// element counter, the hash function `algo`, and the equality predicate `cmp`
// (hset_add / hset_find fall back to `==` when cmp is NULL).
#define HSET(T, BUCKET_T) struct { \
BUCKET_T** buckets; \
size_t nbuckets, len; \
size_t (*algo)(T*); \
bool (*cmp)(T*, T*); \
}
// Declares NAME as a hash set of T, together with its node type NAME_bucket
// and NAME_innertype as an alias of T.
#define hset_def(T, NAME) typedef HSET_BUCKET(T, NAME ## _bucket) NAME ## _bucket; \
typedef HSET(T, NAME ## _bucket) NAME; \
typedef T NAME ## _innertype
// Expression yielding an empty set of type HSET_T with HSET_INIT_CAP
// zero-initialized buckets.
#define hset_initi(HSET_T, ALGO, CMP) (HSET_T) { \
.buckets = xcalloc(HSET_INIT_CAP, sizeof(HSET_T ## _bucket *)), \
.nbuckets = HSET_INIT_CAP, .len = 0, .algo = (ALGO), .cmp = (CMP), \
}
// Initializes the existing variable HSET (of type HSET_T) as an empty set.
#define hset_init(HSET_T, HSET, ALGO, CMP) (HSET) = hset_initi(HSET_T, ALGO, CMP)
// Frees every chain node and the bucket array, then sets buckets to NULL; a
// second call is a no-op. The stored values themselves are not freed (only
// the nodes that hold them).
#define hset_destroy(HSET) do { \
if ((HSET).buckets == NULL) \
break; \
for (size_t i = 0; i < (HSET).nbuckets; i++) { \
if ((HSET).buckets[i] == NULL) \
continue; \
typeof(*(HSET).buckets) cur = (HSET).buckets[i], next;\
for (; cur != NULL; cur = next) { \
next = cur->next; \
xfree(cur); \
} \
} \
xfree((HSET).buckets); \
(HSET).buckets = NULL; \
} while (0)
// c has reminded me of how good i had it with rust
// Iterator step over all elements of HSET. STATE is a `void *` lvalue that
// must be NULL before the first call. Each call yields a pointer to the next
// element's data, or NULL once the set is exhausted.
//
// On the first call STATE is allocated as two pointers:
//   [0] the next bucket slot to examine (starts at HSET.buckets),
//   [1] the chain node returned last (NULL when between chains).
// A call first tries the next node of the current chain; otherwise it scans
// forward for the next non-empty bucket. When no bucket is left, STATE is
// freed and reset to NULL, so the same variable can start a new iteration.
// Abandoning an iteration before it returns NULL leaves STATE allocated; the
// caller must free it.
#define hset_iter(HSET, STATE) ({ \
typeof(&(HSET).buckets[0]->data) ret;\
do { \
if ((STATE) == NULL) {\
(STATE) = xmalloc(2 * sizeof(void*)); \
((void**) (STATE))[0] = (HSET).buckets; \
((void**) (STATE))[1] = NULL; \
} \
typeof(&(HSET).buckets) entryptr = &((typeof(&(HSET).buckets)) (STATE))[0];\
typeof((HSET).buckets) bucketptr = &((typeof((HSET).buckets)) (STATE))[1];\
if (*bucketptr != NULL && (*bucketptr)->next != NULL) { \
*bucketptr = (*bucketptr)->next; \
ret = &(*bucketptr)->data; \
break; \
} \
*bucketptr = NULL; \
for (; *entryptr < (HSET).buckets + (HSET).nbuckets; (*entryptr)++) \
if (**entryptr != NULL) \
break; \
if (*entryptr < (HSET).buckets + (HSET).nbuckets) { \
*bucketptr = **entryptr; \
(*entryptr)++; \
ret = &(*bucketptr)->data; \
} else { \
xfree((STATE)); \
(STATE) = NULL; \
ret = NULL; \
} \
} while (0); \
ret; \
})
// Adds ELEM to HSET unless an equal element is already stored. Equality is
// decided by HSET.cmp, or by `==` when cmp is NULL. Yields true when the
// element was ALREADY present (nothing is inserted) and false when it was
// newly inserted. `len` is incremented on every insertion so that it reflects
// the number of stored elements.
// ELEM must be an lvalue (its address is passed to the hash and compare
// callbacks) and is evaluated several times.
#define hset_add(HSET, ELEM) ({ \
    size_t ind = (HSET).algo(&(ELEM)) % (HSET).nbuckets; \
    bool ret = false; \
    /* slot walks the chain; it ends on the NULL link where a new node goes */ \
    typeof((HSET).buckets) slot = &(HSET).buckets[ind]; \
    while (*slot != NULL) { \
        if ((HSET).cmp == NULL) { \
            if ((*slot)->data == (ELEM)) \
                ret = true; \
        } \
        else if ((*(HSET).cmp)(&(*slot)->data, &(ELEM))) { \
            ret = true; \
        } \
        if (ret) \
            break; \
        slot = &(*slot)->next; \
    } \
    if (!ret) { \
        *slot = xmalloc(sizeof(**slot)); \
        (*slot)->data = (ELEM); \
        (*slot)->next = NULL; \
        (HSET).len += 1; \
    } \
    ret; \
})
// Yields true when an element equal to ELEM is stored in HSET. Equality is
// decided by HSET.cmp, or by `==` when cmp is NULL. ELEM must be an lvalue
// because its address is passed to the hash and compare callbacks.
#define hset_find(HSET, ELEM) ({ \
size_t ind = (HSET).algo(&(ELEM)) % (HSET).nbuckets; \
bool ret = false; \
typeof(*(HSET).buckets) cur; \
for (cur = (HSET).buckets[ind]; !ret && cur != NULL; cur = cur->next) { \
if ((HSET).cmp == NULL) { \
if(cur->data == (ELEM)) \
ret = true; \
} \
else if ((*(HSET).cmp)(&cur->data, &(ELEM))) { \
ret = true; \
} \
} \
ret; \
})
// Ready-made hash set types.
hset_def(int, int_hset_t);
hset_def(char *, charp_hset_t);
// Raw byte type used for page buffers.
typedef unsigned char byte;
#endif

794
src/crawler.c Normal file
View File

@ -0,0 +1,794 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <stdbool.h>
#include <signal.h>
#include <time.h>
#include <errno.h>
#include <unistd.h>
#include <sys/epoll.h>
#include <curl/curl.h>
#include "curl/multi.h"
#include "http.h"
#include "util.h"
#include "crawler.h"
#include "json.h"
#include "module.h"
#define MAX_FAILURE_CNT 5
#define MAX_CONNECTIONS 32
#define MAX_CONNECTIONS_PER_HOST 5
#define EPOLL_BUF_SIZE MAX_CONNECTIONS
#define DEATH_TIMEOUT_S 30
// Set to 1 by fatalsighandler() when the first fatal signal arrives.
volatile sig_atomic_t shoulddie = 0;
// Monotonic timestamp of that first fatal signal.
volatile struct timespec fatalsigrecved = { 0 };

// Signal handler for fatal signals. The first signal only records its arrival
// time and raises `shoulddie`; a signal arriving more than DEATH_TIMEOUT_S
// seconds after the first one makes the process die immediately.
void fatalsighandler(int status) {
    struct timespec current;
    // clock_gettime is async-signal-safe
    int rc = clock_gettime(CLOCK_MONOTONIC, &current);
    if (rc < 0) {
        // die ends up in fprintf, which is not async-signal-safe, but the
        // process is going down at this point anyway
        die("clock_gettime failed: %s (code %d)", strerror(errno), errno);
    }

    if (!shoulddie) {
        fatalsigrecved = current;
        shoulddie = 1;
    }

    time_t elapsed_s = current.tv_sec - fatalsigrecved.tv_sec;
    if (elapsed_s > DEATH_TIMEOUT_S) {
        die("death timeout exceeded (%ds)", DEATH_TIMEOUT_S);
    }
}
// Out-of-line wrapper that forwards all arguments to epoll_wait() and returns
// its result unchanged.
// NOTE(review): presumably exists so the call shows up under its own symbol
// when profiling — confirm intent.
// The parameter names avoid the double-underscore prefix, which is reserved
// for the implementation.
int perf_epoll_wait(int epfd, struct epoll_event *events,
                    int maxevents, int timeout) {
    return epoll_wait(epfd, events, maxevents, timeout);
}
// Out-of-line wrapper that forwards all arguments to
// curl_multi_socket_action() and hands back its result code unchanged.
CURLMcode perf_curl_multi_socket_action(CURLM *multi_handle, curl_socket_t s,
                                        int ev_bitmask, int *running_handles) {
    CURLMcode rc = curl_multi_socket_action(multi_handle, s, ev_bitmask, running_handles);
    return rc;
}
// Tells whether `host` may be crawled. `allowed_hosts` is a NULL-terminated
// list of exact host names; passing NULL for the list itself allows every host.
bool is_allowed_host(const char *host, const char **allowed_hosts) {
    // no allow-list at all means no restriction
    if (!allowed_hosts)
        return true;

    const char **candidate = allowed_hosts;
    while (*candidate != NULL) {
        if (!strcmp(host, *candidate))
            return true;
        candidate++;
    }
    return false;
}
// Validates `link` and, if its host is allowed, queues it at the back of
// `to_crawl`. The pointer itself is queued (no copy is made).
//
// curl_url_h is used as scratch space for parsing and is overwritten.
// Returns CURLUE_OK when the link was queued, the libcurl URL error when
// parsing failed, or CURLUE_BAD_HOSTNAME when the host is not in
// allowed_hosts. Failures are logged.
CURLUcode append2tocrawl(CURLU *curl_url_h, char *link, const char **allowed_hosts, charp_deque_t *to_crawl) {
    char printable[100];

    CURLUcode rc = curl_url_set(curl_url_h, CURLUPART_URL, link, 0);
    if (rc != CURLUE_OK) {
        sanitize2ascii(printable, link, sizeof(printable));
        error("URL parsing failed for \"%s\": %s (code %d)",
              printable, curl_url_strerror(rc), rc);
        return rc;
    }

    char *host = NULL;
    rc = curl_url_get(curl_url_h, CURLUPART_HOST, &host, CURLU_PUNYCODE);
    if (rc != CURLUE_OK) {
        error("URL host parsing failed: %s (code %d)", curl_url_strerror(rc), rc);
        return rc;
    }

    // the host string is only needed for the allow-list check
    bool allowed = is_allowed_host(host, allowed_hosts);
    curl_free(host);

    if (!allowed) {
        sanitize2ascii(printable, link, sizeof(printable));
        error("URL not in allowed hosts \"%s\", not crawling", printable);
        return CURLUE_BAD_HOSTNAME;
    }

    deque_push_back(*to_crawl, link);
    return CURLUE_OK;
}
#define MAX_SOCKETS_PER_TRANS 4
// One in-flight transfer; transfers form a singly linked list headed by
// crawlerstate_t.headtransfer.
typedef struct transfer {
struct transfer *next;
// libcurl easy handle performing the request
CURL *handle;
// callback state filled in while the response is received
cbdata_t cbdata;
char *url;
// index of this transfer's host in crawlerstate_t.hostentries
int hostentryind;
} transfer_t;
// All mutable state of a crawl.
typedef struct {
const crawlerconfig_t *config;
// URLs waiting to be fetched
charp_deque_t to_crawl;
// URLs for which a transfer has been started (checked by didvisit())
charp_dynarr_t visited;
host_dynarr_t hostentries;
transfer_t *headtransfer;
int epollfd;
// NOTE(review): presumably the timeout most recently reported by libcurl via timercb — confirm at the setup site.
long timeout_ms;
CURLM *curl_multi_h;
// sockets currently registered with epollfd (maintained by socketcb())
int_dynarr_t sockets;
moduleentryp_dynarr_t modules;
} crawlerstate_t;
// Result of starting or finishing a transfer.
typedef enum {
TRANS_OK,
TRANS_ERROR,
TRANS_VISITED,
TRANS_HOST_ERROR_EXCEEDED,
TRANS_HOST_ERROR,
TRANS_WRITE_ERR,
TRANS_FATAL,
} transfercode_t;
// libcurl multi socket callback: mirrors libcurl's interest in socket `s`
// into the crawler's epoll set and keeps clientp->sockets in sync with it.
// Returns 0 on success and -1 (after logging) when epoll_ctl fails or when
// libcurl asks to remove a socket that is not tracked.
int socketcb(CURL *easy, curl_socket_t s, int what, crawlerstate_t *clientp, transfer_t *socketp) {
    struct epoll_event ev = { .data = { .fd = s } };

    // position of s in the tracked-socket list, or -1 when it is not tracked
    int pos = -1;
    for (size_t i = 0; pos < 0 && i < clientp->sockets.len; i++) {
        if (clientp->sockets.data[i] == s)
            pos = i;
    }

    if (what == CURL_POLL_NONE)
        return 0;

    if (what == CURL_POLL_REMOVE) {
        if (pos < 0) {
            fatal("invalid socket specified!");
            return -1;
        }
        if (epoll_ctl(clientp->epollfd, EPOLL_CTL_DEL, s, NULL) < 0) {
            fatal("epoll_ctl failed: %s (code %d)", strerror(errno), errno);
            return -1;
        }
        dynarr_remove(clientp->sockets, pos);
        return 0;
    }

    // translate libcurl's interest into an epoll event mask; any other value
    // leaves the mask at 0
    if (what == CURL_POLL_IN)
        ev.events = EPOLLIN;
    else if (what == CURL_POLL_OUT)
        ev.events = EPOLLOUT;
    else if (what == CURL_POLL_INOUT)
        ev.events = EPOLLIN | EPOLLOUT;

    // a socket that is already tracked is modified, a new one is added
    bool tracked = pos >= 0;
    if (epoll_ctl(clientp->epollfd, tracked ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, s, &ev) < 0) {
        fatal("epoll_ctl failed: %s (code %d)", strerror(errno), errno);
        return -1;
    }
    if (!tracked)
        dynarr_push(clientp->sockets, s);
    return 0;
}
// libcurl multi timer callback: stores the timeout requested by libcurl (in
// milliseconds) in the long that clientp points to. Always returns 0.
int timercb(CURLM *multi, long timeout_ms, long *clientp) {
    long *timeout_slot = clientp;
    *timeout_slot = timeout_ms;
    return 0;
}
// Looks up the entry for `host` in host_data, creating one if necessary.
//
// Returns true when an existing entry was found ("cached"); *hostentryind is
// then its index. Returns false otherwise. In that case a new entry with a
// randomly picked User-Agent header is appended and *hostentryind is its
// index, or *hostentryind is -1 when no entry could be created (avail_uas is
// NULL or empty, or building the header list failed).
//
// A newly created entry stores the `host` pointer itself (no copy), so the
// caller must keep that string alive for as long as the entry exists.
bool gethostdata(const char *host, const char **avail_uas, host_dynarr_t *host_data, int *hostentryind) {
    for (size_t i = 0; i < host_data->len; i++) {
        if (strcmp(host, host_data->data[i].host) == 0) {
            *hostentryind = i;
            return true;
        }
    }

    *hostentryind = -1;
    if (avail_uas == NULL)
        return false;

    // pick one user agent at random
    size_t navail_uas = 0;
    while (avail_uas[navail_uas] != NULL)
        navail_uas++;
    // an empty list would otherwise lead to a modulo by zero below
    if (navail_uas == 0)
        return false;
    const char *picked_ua = avail_uas[(size_t) random() % navail_uas];

    const char prefix[] = "User-Agent: ";
    // sizeof(prefix) already accounts for the terminating NUL
    char *header = xmalloc(strlen(picked_ua) + sizeof(prefix));
    strcpy(header, prefix);
    strcat(header, picked_ua);

    struct curl_slist *headers = curl_slist_append(NULL, header);
    // curl_slist_append copies the string, so the temporary buffer is
    // released on both the success and the failure path
    xfree(header);
    if (headers == NULL)
        return false;

    hostentry_t new_entry = {
        .host = host,
        .headers = headers,
        .totalfailurecnt = 0,
        .failurecnt = 0,
        .visitcnt = 0
    };
    dynarr_push(*host_data, new_entry);
    *hostentryind = host_data->len - 1;
    return false;
}
// Returns true when `url` is byte-for-byte equal to an entry of `visited`.
// Linear scan; strcmp avoids narrowing the string lengths to int.
bool didvisit(const char *url, charp_dynarr_t *visited) {
    for (size_t i = 0; i < visited->len; i++) {
        if (strcmp(url, visited->data[i]) == 0)
            return true;
    }
    return false;
}
// Starts an asynchronous transfer for `url`.
//
// curl_url_h   scratch URL handle used for parsing (overwritten).
// state        crawler state; `visited`, `hostentries` and the transfer list
//              are updated.
// url          URL to fetch; the pointer is stored in `visited` and in the
//              transfer, it is not copied.
// transfer_ret receives the new transfer on TRANS_OK. It is set to NULL when
//              the transfer could not be started and is left untouched on
//              TRANS_VISITED.
// wasrequested forwarded to makehandle().
//
// Returns TRANS_OK on success, TRANS_VISITED when the URL was already
// started, TRANS_HOST_ERROR_EXCEEDED when the host has failed more than
// MAX_FAILURE_CNT times in a row, and TRANS_FATAL on any other error.
transfercode_t starttransfer(CURLU *curl_url_h, crawlerstate_t *state, char *url,
                             transfer_t **transfer_ret, bool wasrequested) {
    char sanitized[100];
    if (didvisit(url, &state->visited))
        // DO NOT free the URL. We are going to use it in our JSON later
        return TRANS_VISITED;
    dynarr_push(state->visited, url);
    sanitize2ascii(sanitized, url, sizeof(sanitized));
    info("crawling \"%s\"", sanitized);

    // Retrieves handle and host data
    transfer_t *transfer = xmalloc(sizeof(transfer_t));
    memset(transfer, 0, sizeof(transfer_t));
    *transfer_ret = NULL;

    char *curl_host = NULL;
    CURLUcode url_res;
    // each assignment is parenthesized so url_res holds the real error code
    // and not the result of the comparison
    if ((url_res = curl_url_set(curl_url_h, CURLUPART_URL, url, 0)) != CURLUE_OK ||
        (url_res = curl_url_get(curl_url_h, CURLUPART_HOST, &curl_host, CURLU_PUNYCODE)) != CURLUE_OK) {
        fatal("URL host parsing failed: %s (code %d)", curl_url_strerror(url_res), url_res);
        curl_free(curl_host);
        xfree(transfer);
        return TRANS_FATAL;
    }

    int hostentryind;
    bool wascached = gethostdata(curl_host, useragents, &state->hostentries, &hostentryind);
    if (hostentryind < 0) {
        // host is (should be) in punycode so it's fine if we don't sanitize it
        fatal("Failed to get host entry for host \"%s\"", curl_host);
        curl_free(curl_host);
        xfree(transfer);
        return TRANS_FATAL;
    }
    // a newly created host entry keeps the curl_host pointer, so the string
    // is only released when an existing entry was reused
    if (wascached)
        curl_free(curl_host);

    hostentry_t *hostentry = &state->hostentries.data[hostentryind];
    initcbdata(url, &state->modules, &transfer->cbdata);
    transfer->hostentryind = hostentryind;
    transfer->url = url;
    transfer->handle = makehandle(url, hostentry, &transfer->cbdata, wasrequested);
    if (transfer->handle == NULL) {
        fatal("makehandle() failed");
        xfree(transfer->cbdata.writecb_data.base);
        xfree(transfer);
        return TRANS_FATAL;
    }
    if (hostentry->failurecnt > MAX_FAILURE_CNT) {
        error("Max failure count (%d) for host exceeded", MAX_FAILURE_CNT);
        curl_easy_cleanup(transfer->handle);
        xfree(transfer->cbdata.writecb_data.base);
        xfree(transfer);
        return TRANS_HOST_ERROR_EXCEEDED;
    }

    // Hand the easy handle over to the multi handle
    CURLMcode mc = curl_multi_add_handle(state->curl_multi_h, transfer->handle);
    if (mc) {
        fatal("curl_multi_add_handle failed: %s (code %d)", curl_multi_strerror(mc), mc);
        curl_easy_cleanup(transfer->handle);
        xfree(transfer->cbdata.writecb_data.base);
        xfree(transfer);
        return TRANS_FATAL;
    }

    // Add the transfer to the end of the transfer list
    if (state->headtransfer == NULL) {
        state->headtransfer = transfer;
    }
    else {
        transfer_t *last = state->headtransfer;
        for (; last->next != NULL; last = last->next)
            ;
        last->next = transfer;
    }
    *transfer_ret = transfer;
    return TRANS_OK;
}
// Detaches the transfer's easy handle from the multi handle, cleans it up,
// and frees the receive buffer (if any) and the transfer itself. The transfer
// is not unlinked from any list here.
void destroytransfer(CURLM *curl_multi_h, transfer_t *trans) {
    CURL *easy = trans->handle;
    byte *recv_buf = trans->cbdata.writecb_data.base;

    curl_multi_remove_handle(curl_multi_h, easy);
    curl_easy_cleanup(easy);
    if (recv_buf)
        xfree(recv_buf);
    xfree(trans);
}
// TODO: Extract arguments into a struct
// Handles a completed transfer reported by libcurl.
//
// curl_url_h  scratch URL handle used while queueing discovered links.
// state       crawler state; host counters, `to_crawl` and `visited` are updated.
// msg         completion message from libcurl; msg->data.result is the outcome.
// trans       the transfer that finished.
// links       initialized as an empty array when the transfer succeeded; the
//             onpagecomplete hooks receive it as `parsedlinks`.
// extrajson   on success, receives the "json" extra data produced by the
//             modules, or an empty array if there is none.
// cb          optional request callback; when non-NULL it is invoked instead
//             of the module hooks.
// cb_userdata passed through to cb.
//
// When the transfer failed, `links` and `extrajson` are NOT initialized and
// the return value classifies the failure. Returns TRANS_OK on success.
transfercode_t transferfinished(CURLU *curl_url_h, crawlerstate_t *state,
                                CURLMsg *msg, transfer_t *trans, charp_dynarr_t *links, jsonkv_dynarr_t *extrajson,
                                reqcb_t cb, void *cb_userdata) {
    CURLcode trans_res = msg->data.result;
    char sanitized[100];
    hostentry_t *hostentry = &state->hostentries.data[trans->hostentryind];
    hostentry->visitcnt++;

    // Check if transfer went OK
    if (trans_res != CURLE_OK) {
        sanitize2ascii(sanitized, trans->url, sizeof(sanitized));
        transfercode_t ret;
        headercb_data_t *headerdata;
        switch (trans_res) {
        case CURLE_WRITE_ERROR:
            headerdata = &trans->cbdata.headercb_data;
            if (!(headerdata->flags & HEADERCB_VALID_MIME))
                info("request failed to \"%s\": is not of mime type text/html, not crawling", sanitized);
            else if (headerdata->status != 200 && headerdata->status > 0)
                // argument order matches the format: URL first, then status
                info("request failed to \"%s\": returned status code %d", sanitized, headerdata->status);
            else
                info("request failed to \"%s\": header parsing error or page too big", sanitized);
            ret = TRANS_WRITE_ERR;
            break;
        case CURLE_REMOTE_ACCESS_DENIED:
        case CURLE_BAD_CONTENT_ENCODING:
        case CURLE_PEER_FAILED_VERIFICATION:
        case CURLE_WEIRD_SERVER_REPLY:
        case CURLE_BAD_DOWNLOAD_RESUME:
            // these count against the host and are then reported like the
            // non-fatal errors below
            hostentry->failurecnt++;
            hostentry->totalfailurecnt++;
            /* fall through */
        case CURLE_RANGE_ERROR:
        case CURLE_UNSUPPORTED_PROTOCOL:
        case CURLE_AUTH_ERROR:
        case CURLE_LOGIN_DENIED:
        case CURLE_TOO_MANY_REDIRECTS:
        case CURLE_FILESIZE_EXCEEDED:
        case CURLE_HTTP2:
        case CURLE_HTTP3:
        case CURLE_HTTP2_STREAM:
        case CURLE_QUIC_CONNECT_ERROR:
            error("non-fatal error: %s (code %d)", curl_easy_strerror(trans_res), trans_res);
            ret = TRANS_ERROR;
            break;
        case CURLE_SSL_CONNECT_ERROR:
        case CURLE_COULDNT_RESOLVE_HOST:
        case CURLE_COULDNT_CONNECT:
        case CURLE_OPERATION_TIMEDOUT:
            info("retrying (eventually)...: %s (code %d)", curl_easy_strerror(trans_res), trans_res);
            // Add to to_crawl and remove from visited so that we retry the transfer (eventually)
            deque_push_front(state->to_crawl, trans->url);
            for (size_t i = 0; i < state->visited.len; i++) {
                // Shallow compare is fine
                if (state->visited.data[i] == trans->url) {
                    dynarr_remove(state->visited, i);
                    break;
                }
            }
            hostentry->failurecnt++;
            hostentry->totalfailurecnt++;
            ret = TRANS_HOST_ERROR;
            break;
        default:
            fatal("aborting...: %s (code %d)", curl_easy_strerror(trans_res), trans_res);
            ret = TRANS_FATAL;
            break;
        }
        return ret;
    }
    // a successful transfer resets the consecutive-failure counter
    hostentry->failurecnt = 0;

    // Link aggregation and parsing logic
    writecb_data_t writecb_data = trans->cbdata.writecb_data;
    char *page = (char*)writecb_data.base;
    size_t npage = writecb_data.begin - writecb_data.base;
    dynarr_init(charp_dynarr_t, *links);
    // both values are size_t, hence %zu
    debug("%zu links in %zu bytes ", links->len, npage);
    pagecompletedata_t moduledata = {
        .url = trans->url,
        .handle = trans->handle,
        .page = page,
        .npage = npage,
        .parsedlinks = links,
    };

    // Call callback and terminate, if provided
    if (cb != NULL) {
        dynarr_init(jsonkv_dynarr_t, *extrajson);
        cb(cb_userdata, moduledata.url, moduledata.page, moduledata.npage, trans->handle);
        return TRANS_OK;
    }

    // Module and JSON stuff
    dynarr_init(extradata_dynarr_t, moduledata.extradata);
    int rc;
    for (size_t i = 0; i < state->modules.len; i++) {
        if (state->modules.data[i]->module.onpagecomplete != NULL) {
            rc = state->modules.data[i]->module.onpagecomplete(
                state->modules.data[i]->module.userdata, &moduledata);
            if (rc != 0)
                error("module %s onpagecomplete failed with code %d", state->modules.data[i]->name, rc);
        }
    }
    for (size_t i = 0; i < links->len; i++)
        append2tocrawl(curl_url_h, links->data[i], state->config->allowedhosts, &state->to_crawl);
    // onpagedestroy hooks run in reverse module order; the index wraps to
    // SIZE_MAX after 0, which ends the loop (and skips it when len is 0)
    for (size_t i = state->modules.len - 1; i != SIZE_MAX; i--) {
        if (state->modules.data[i]->module.onpagedestroy != NULL) {
            rc = state->modules.data[i]->module.onpagedestroy(
                state->modules.data[i]->module.userdata, &moduledata);
            if (rc != 0)
                error("module %s onpagedestroy failed with code %d", state->modules.data[i]->name, rc);
        }
    }
    jsonkv_dynarr_t *jsoncand =
        searchextradata(EXTRA_JSON, "json", moduledata.extradata.data, moduledata.extradata.len);
    dynarr_destroy(moduledata.extradata);
    // TODO: Make it so we don't initialize an empty array on no extrajson
    if (jsoncand == NULL)
        dynarr_init(jsonkv_dynarr_t, *extrajson);
    else
        *extrajson = *jsoncand;
    return TRANS_OK;
}
// Result record for one crawled page. crawler() fills one of these for every
// transfer that finished with TRANS_OK and links2json() serialises them.
typedef struct {
// URL of the page (taken from the transfer's url field in crawler())
char *url;
// Links collected for this page
charp_dynarr_t links;
// Extra key/value pairs that links2json() appends to this page's JSON object
jsonkv_dynarr_t extrajson;
} linkentry_t;
// Presumably declares linkentry_dynarr_t as a dynamic array of linkentry_t
// (dynarr_def is defined outside this file section).
dynarr_def(linkentry_t, linkentry_dynarr_t);
// Builds the list of distinct links found across all crawled pages.
// Every link of every entry is added to a hash set, then the set's contents
// are copied into a fresh dynamic array which is returned by value. The array
// stores the same string pointers that the entries hold (no copies are made).
charp_dynarr_t getuniqlinks(linkentry_dynarr_t *links) {
    charp_hset_t dedup_hset = hset_initi(charp_hset_t, hset_charp_hash, hset_charp_cmp);
    for (size_t entryind = 0; entryind < links->len; entryind++) {
        for (size_t linkind = 0; linkind < links->data[entryind].links.len; linkind++) {
            hset_add(dedup_hset, links->data[entryind].links.data[linkind]);
        }
    }

    // Drain the set into the result array
    charp_dynarr_t result = dynarr_initi(charp_dynarr_t);
    void *iterstate = NULL;
    char **cur;
    while ((cur = hset_iter(dedup_hset, iterstate)) != NULL) {
        dynarr_push(result, *cur);
    }

    hset_destroy(dedup_hset);
    return result;
}
// Builds the crawl report as a JSON object with three members:
//   "hosts"       - per-host object with "up", "visitcnt" and "failurecnt"
//   "urlindicies" - array of every unique link string
//   "links"       - per-page object with "link_indicies" (positions of the
//                   page's links inside the unique link array), "nlinks" and
//                   any extra key/value pairs stored on the entry
// The returned value takes over the dynamic arrays created here.
jsonval_t links2json(charp_dynarr_t *uniqlinks, linkentry_dynarr_t *links, host_dynarr_t *hosts) {
    jsonkv_t kv;

    // One object per host
    jsonkv_dynarr_t hostsobj = dynarr_initi(jsonkv_dynarr_t);
    for (size_t hostind = 0; hostind < hosts->len; hostind++) {
        hostentry_t *host = &hosts->data[hostind];
        jsonkv_dynarr_t stats = dynarr_initi(jsonkv_dynarr_t);

        // A host counts as "up" when it was visited more often than it failed
        kv = (jsonkv_t){ .key = "up", .val = json_createbool(host->visitcnt > host->totalfailurecnt) };
        dynarr_push(stats, kv);
        kv = (jsonkv_t){ .key = "visitcnt", .val = json_createint(host->visitcnt) };
        dynarr_push(stats, kv);
        kv = (jsonkv_t){ .key = "failurecnt", .val = json_createint(host->totalfailurecnt) };
        dynarr_push(stats, kv);

        kv = (jsonkv_t){ .key = host->host, .val = json_createobj(stats) };
        dynarr_push(hostsobj, kv);
    }

    // One object per visited page
    jsonkv_dynarr_t pagesobj = dynarr_initi(jsonkv_dynarr_t);
    for (size_t pageind = 0; pageind < links->len; pageind++) {
        linkentry_t *page = &links->data[pageind];
        jsonkv_dynarr_t pageobj = dynarr_initi(jsonkv_dynarr_t);
        jsonval_dynarr_t indices = dynarr_initi(jsonval_dynarr_t);

        for (size_t linkind = 0; linkind < page->links.len; linkind++) {
            char *target = page->links.data[linkind];
            // Linear search for the link's position in the unique list
            size_t pos = 0;
            while (pos < uniqlinks->len && strcmp(uniqlinks->data[pos], target) != 0)
                pos++;
            if (pos >= uniqlinks->len)
                // Every page link was fed into the unique set, so this is unreachable
                die("You should never see this message unless i fucked the hash set implementation");
            jsonval_t posval = json_createint(pos);
            dynarr_push(indices, posval);
        }

        kv = (jsonkv_t){ .key = "link_indicies", .val = json_createarr(indices) };
        dynarr_push(pageobj, kv);
        kv = (jsonkv_t){ .key = "nlinks", .val = json_createint(indices.len) };
        dynarr_push(pageobj, kv);
        for (size_t extraind = 0; extraind < page->extrajson.len; extraind++)
            dynarr_push(pageobj, page->extrajson.data[extraind]);

        kv = (jsonkv_t){ .key = page->url, .val = json_createobj(pageobj) };
        dynarr_push(pagesobj, kv);
    }

    // Unique links as a JSON array of strings
    jsonval_dynarr_t uniqarr = dynarr_initi(jsonval_dynarr_t);
    for (size_t uniqind = 0; uniqind < uniqlinks->len; uniqind++)
        dynarr_push(uniqarr, json_createstr(uniqlinks->data[uniqind]));

    // Top-level object
    jsonkv_dynarr_t root = dynarr_initi(jsonkv_dynarr_t);
    kv = (jsonkv_t) { .key = "hosts", .val = json_createobj(hostsobj) };
    dynarr_push(root, kv);
    kv = (jsonkv_t) { .key = "urlindicies", .val = json_createarr(uniqarr) };
    dynarr_push(root, kv);
    kv = (jsonkv_t) { .key = "links", .val = json_createobj(pagesobj) };
    dynarr_push(root, kv);
    return json_createobj(root);
}
void crawler(const char *seed, const crawlerconfig_t *config) {
CURLU *curl_url_h = curl_url();
if (curl_url_h == NULL) {
fatal("CURL URL failed to initialize");
return;
}
if ((signal(SIGINT, fatalsighandler) == SIG_ERR)) {
//(signal(SIGSEGV, fatalsighandler) == SIG_ERR) ||
//(signal(SIGTERM, fatalsighandler) == SIG_ERR)) {
fatal("signal() failed: %s (code %d)", strerror(errno), errno);
return;
}
crawlerstate_t state = { 0 };
state.epollfd = -1; // For cleanup if error
deque_init(charp_deque_t, state.to_crawl);
dynarr_init(charp_dynarr_t, state.visited);
dynarr_init(host_dynarr_t, state.hostentries);
dynarr_init(int_dynarr_t, state.sockets);
state.config = config;
char *seed_buf = xmalloc(strlen(seed) + 1);
strcpy(seed_buf, seed);
deque_push_back(state.to_crawl, seed_buf);
struct epoll_event *event_buf = xmalloc(sizeof(struct epoll_event) * EPOLL_BUF_SIZE);
state.curl_multi_h = curl_multi_init();
if (state.curl_multi_h == NULL) {
fatal("CURL Multi failed to initialize");
goto cleanup;
}
CURLMcode mc;
if ((mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_SOCKETFUNCTION, socketcb)) ||
(mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_SOCKETDATA, &state)) ||
(mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_TIMERFUNCTION, timercb)) ||
(mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_TIMERDATA, &state.timeout_ms)) ||
(mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_MAX_HOST_CONNECTIONS, MAX_CONNECTIONS_PER_HOST)) ||
(mc = curl_multi_setopt(state.curl_multi_h, CURLMOPT_MAX_TOTAL_CONNECTIONS, MAX_CONNECTIONS))) {
fatal("curl_multi_setopt failed: %s (code %d)", curl_multi_strerror(mc), mc);
goto cleanup;
}
state.epollfd = epoll_create1(0);
if (state.epollfd < 0) {
fatal("epoll_create1 failed: %s (code %d)", strerror(errno), errno);
goto cleanup;
}
linkentry_dynarr_t linkentries = dynarr_initi(linkentry_dynarr_t);
state.modules = config->enabledmodules;
for (size_t i = 0; i < state.modules.len; i++) {
int (*init)(crawlermodule_t *) = state.modules.data[i]->module.init;
memset(&state.modules.data[i]->module, 0, sizeof(crawlermodule_t));
state.modules.data[i]->module.init = init;
state.modules.data[i]->module.init(&state.modules.data[i]->module);
}
size_t ntransfers = 0;
int running_handles = 0;
for(;;) {
if (shoulddie > 0) {
fatal("shoulddie = %d", shoulddie);
break;
}
transfer_t *transfer;
for (size_t i = 0; i < requestedreqs.len && ntransfers < MAX_CONNECTIONS; i++) {
requestedreq_t *request = &requestedreqs.data[i];
if (request->handle != NULL)
continue;
transfercode_t transcode = starttransfer(curl_url_h, &state,
(char*)request->url, &transfer, true);
if (transcode == TRANS_FATAL)
break;
else if (transcode != TRANS_OK)
continue;
transfer->cbdata.writecb_data.wasrequested = true;
request->handle = transfer->handle;
ntransfers++;
}
charp_dynarr_t stalled = dynarr_initi(charp_dynarr_t);
for (size_t i = 0; i < state.to_crawl.len && ntransfers < MAX_CONNECTIONS; i++) {
char *crawling = deque_pop_front(state.to_crawl);
bool passed = true;
for (size_t i = 0; i < state.modules.len; i++) {
moduleentry_t *entry = state.modules.data[i];
if (entry->module.filter == NULL)
continue;
filterres_t res = entry->module.filter(entry->module.userdata, crawling);
if (res != FILTER_PASS) {
passed = false;
char sanitized[100];
sanitize2ascii(sanitized, crawling, sizeof(sanitized));
if (res == FILTER_STALL) {
//debug("URL \"%s\" was stalled by module %s", sanitized, entry->name);
dynarr_push(stalled, crawling);
break;
}
else if (res == FILTER_REJECT) {
debug("URL \"%s\" was rejected by module %s", sanitized, entry->name);
// Don't add it to the visited list, in case the filter changes its mind
break;
}
}
}
if (!passed)
continue;
transfercode_t transcode = starttransfer(curl_url_h, &state, crawling, &transfer, false);
if (transcode == TRANS_FATAL)
break;
else if (transcode != TRANS_OK)
continue;
ntransfers++;
}
while (stalled.len > 0)
deque_push_back(state.to_crawl, dynarr_pop(stalled));
mc = perf_curl_multi_socket_action(state.curl_multi_h, -1, CURL_SOCKET_TIMEOUT, &running_handles);
if (mc) {
fatal("curl_multi_socket_action failed: %s (code %d)", curl_multi_strerror(mc), mc);
goto cleanup;
}
if (running_handles == 0)
continue;
// Main add/remove transfer loop
int availfds = perf_epoll_wait(state.epollfd, event_buf, EPOLL_BUF_SIZE, state.timeout_ms);
if (availfds < 0) {
fatal("epoll_wait failed: %s (code %d)", strerror(errno), errno);
break;
}
transfer_t *trans = state.headtransfer;
if (trans == NULL && state.to_crawl.len == 0) {
info("No more URLs left to crawl");
break;
}
// Tell CURL about any action connections
if (availfds > 0) {
for (int i = 0; i < availfds; i++) {
struct epoll_event *connevent = &event_buf[i];
int ev_bitmask = 0;
// TODO: Check for errors on the descriptor
if (connevent->events & EPOLLIN)
ev_bitmask |= CURL_CSELECT_IN;
if (connevent->events & EPOLLOUT)
ev_bitmask |= CURL_CSELECT_OUT;
CURLMcode mc;
mc = perf_curl_multi_socket_action(state.curl_multi_h, connevent->data.fd,
ev_bitmask, &running_handles);
if (mc) {
fatal("curl_multi_socket_action failed: %s (code %d)", curl_multi_strerror(mc), mc);
goto cleanup;
}
}
}
else {
CURLMcode mc;
mc = perf_curl_multi_socket_action(state.curl_multi_h, CURL_SOCKET_TIMEOUT, -1,
&running_handles);
if (mc) {
fatal("curl_multi_socket_action failed: %s (code %d)", curl_multi_strerror(mc), mc);
break;
}
}
// Process and prunes any finished connections
CURLMsg *msg;
int nmsgs;
while ((msg = curl_multi_info_read(state.curl_multi_h, &nmsgs)) != NULL){
// No other message types are currently defined, but in case they are
if (msg->msg != CURLMSG_DONE)
continue;
transfer_t *trans, *prev;
for (trans = state.headtransfer, prev = NULL;
trans != NULL;
prev = trans, trans = trans->next
)
if (trans->handle == msg->easy_handle)
break;
if (trans == NULL) {
fatal("message handle not found, handle=%p, result=%d",
(void*)msg->easy_handle, msg->data.result);
goto cleanup;
}
// Check if the request was requested
size_t requestind;
for (requestind = 0;
requestind < requestedreqs.len &&
requestedreqs.data[requestind].handle != trans->handle;
requestind++)
;
reqcb_t cb = NULL;
void *userdata = NULL;
if (requestind < requestedreqs.len) {
cb = requestedreqs.data[requestind].cb;
userdata = requestedreqs.data[requestind].userdata;
dynarr_remove(requestedreqs, requestind);
}
// Handle it and log that we visited it
charp_dynarr_t links;
jsonkv_dynarr_t extrajson;
transfercode_t transcode = transferfinished(curl_url_h, &state, msg, trans, &links,
&extrajson, cb, userdata);
if (cb == NULL && transcode == TRANS_OK) {
linkentry_t entry = { .url = trans->url, .links = links, .extrajson = extrajson };
dynarr_push(linkentries, entry);
}
// Remove it
if (prev == NULL)
// Transfer was the head
state.headtransfer = trans->next;
else
// Transfer was not the head
prev->next = trans->next;
prev = trans;
destroytransfer(state.curl_multi_h, trans);
if (transcode == TRANS_FATAL)
goto cleanup;
ntransfers--;
}
}
jsonval_t json;
charp_dynarr_t uniqlinks;
cleanup:
// Create and write out all of the json
uniqlinks = getuniqlinks(&linkentries);
json = links2json(&uniqlinks, &linkentries, &state.hostentries);
json_write(stdout, &json);
// Destroy all modules first bc they might have some save data
for (size_t i = 0; i < state.modules.len; i++)
if (state.modules.data[i]->module.destroy != NULL)
state.modules.data[i]->module.destroy(state.modules.data[i]->module.userdata);
// Destroy all links
json_destroy(&json);
for (size_t i = 0; i < linkentries.len; i++)
dynarr_destroy(linkentries.data[i].links);
dynarr_destroy(linkentries);
for (size_t i = 0; i < uniqlinks.len; i++)
xfree(uniqlinks.data[i]);
dynarr_destroy(uniqlinks);
// Destroy everything else
for (size_t i = 0; i < state.hostentries.len; i++) {
curl_slist_free_all(state.hostentries.data[i].headers);
curl_free((void*)state.hostentries.data[i].host);
}
for (transfer_t *trans = state.headtransfer, *next; trans != NULL; trans = next) {
next = trans->next;
destroytransfer(state.curl_multi_h, trans);
}
if (state.curl_multi_h != NULL)
curl_multi_cleanup(state.curl_multi_h);
if (state.epollfd >= 0)
close(state.epollfd);
xfree(event_buf);
curl_url_cleanup(curl_url_h);
dynarr_destroy(state.visited);
dynarr_destroy(state.hostentries);
dynarr_destroy(state.sockets);
dynarr_destroy(requestedreqs);
deque_destroy(state.to_crawl);
}

152
src/http.c Normal file
View File

@ -0,0 +1,152 @@
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <curl/curl.h>
#include "http.h"
#include "util.h"
#define INIT_PAGE_SIZE 8192
#define MAX_PAGE_SIZE 1048576
#define MEM_INC_FACTOR 2
#define TIMEOUT_MS 10000
#define CONNECT_TIMEOUT_MS 3000
// NULL-terminated list of user agent names.
// NOTE(review): nothing in the visible part of this file reads this array —
// confirm where (or whether) it is consumed.
const char *useragents[] = {
"AvaBot",
NULL,
};
// libcurl write callback: appends the received bytes to the page buffer in
// `userdata`, keeping the buffer NUL-terminated.
//
// Every module with an onpagewrite hook is notified about the chunk first.
// The buffer grows by MEM_INC_FACTOR up to MAX_PAGE_SIZE. If the chunk still
// does not fit, only the part that fits is stored and the smaller count is
// returned, which makes libcurl abort the transfer.
//
// Returns the number of bytes stored.
size_t bufwritecb(const byte *ptr, size_t size, size_t nmemb, writecb_data_t *userdata) {
    // libcurl documents `size` as always 1; multiply anyway to follow the API contract
    size_t incoming = size * nmemb;

    for (size_t i = 0; i < userdata->modules->len; i++) {
        if (userdata->modules->data[i]->module.onpagewrite != NULL) {
            int rc;
            rc = userdata->modules->data[i]->module.onpagewrite(
                userdata->modules->data[i]->module.userdata, userdata->url, ptr, incoming);
            if (rc != 0)
                error("module %s onpagewrite failed with code %d", userdata->modules->data[i]->name, rc);
        }
    }

    // Grow until the chunk plus its terminator fits, or the size cap is hit
    while (incoming >= (size_t)(userdata->end - userdata->begin)) {
        size_t buf_len = userdata->end - userdata->base;
        if (buf_len >= MAX_PAGE_SIZE)
            break;
        size_t new_buf_len = buf_len * MEM_INC_FACTOR;
        if (new_buf_len > MAX_PAGE_SIZE)
            new_buf_len = MAX_PAGE_SIZE;
        byte *new_base = xrealloc(userdata->base, new_buf_len);
        userdata->end = new_base + new_buf_len;
        userdata->begin = new_base + (userdata->begin - userdata->base);
        userdata->base = new_base;
    }

    size_t room = userdata->end - userdata->begin;
    if (room == 0)
        // No space even for the terminator; report failure to libcurl
        return 0;
    size_t len = incoming;
    if (incoming >= room)
        // Buffer is still undersized: keep one byte free for the terminator
        len = room - 1;
    memcpy(userdata->begin, ptr, len);
    userdata->begin[len] = '\0';
    userdata->begin += len;
    return len;
}
// Returns true when `status` is an HTTP redirect status code that carries a
// Location header to follow.
bool is_redirect(int status) {
    switch (status) {
    case 301: // Moved Permanently
    case 302: // Found
    case 303: // See Other
    case 307: // Temporary Redirect
    case 308: // Permanent Redirect
        return true;
    default:
        return false;
    }
}
// libcurl header callback. Called once per header line; `buffer` holds
// `nitems` bytes and is NOT NUL-terminated.
//
// Status lines ("HTTP/..."): the three-digit code is stored in
// userdata->status. Anything other than 200 or a redirect aborts the transfer.
//
// Content-Type (first occurrence after a non-redirect status): the transfer is
// aborted unless the value starts with "text/html". On success
// HEADERCB_VALID_MIME is set.
//
// Returns nitems to continue, CURL_WRITEFUNC_ERROR to abort the transfer.
size_t headerwritecb(const char *buffer, size_t _size, size_t nitems, headercb_data_t *userdata) {
    (void)_size;
    const char content_type_str[] = "content-type:", html_mime_str[] = "text/html", http_str[] = "HTTP/";
    const char *const end = buffer + nitems;

    // Parses HTTP status line
    if (nitems < sizeof(http_str) - 1)
        return nitems;
    if (memcmp(buffer, http_str, sizeof(http_str) - 1) == 0) {
        // Header is an http status line
        userdata->num_requests++;
        const char *code = memchr(buffer, ' ', nitems);
        if (code != NULL)
            while (code < end && *code == ' ')
                code++;
        // The code needs three bytes inside the buffer
        if (code == NULL || end - code < 3) {
            userdata->status = 0;
            return CURL_WRITEFUNC_ERROR;
        }
        char code_str[4] = { code[0], code[1], code[2], '\0' };
        userdata->status = atoi(code_str);
        if (userdata->status == 200 || is_redirect(userdata->status))
            return nitems;
        return CURL_WRITEFUNC_ERROR;
    }
    // Headers of redirect responses (or before any status) are not inspected
    if (userdata->status == 0 || is_redirect(userdata->status))
        return nitems;

    // Parses Content-Type header
    if (userdata->flags & HEADERCB_CONTENT_TYPE_ENCOUNTERED)
        return nitems;
    if (nitems < sizeof(content_type_str) - 1)
        return nitems;
    for (size_t i = 0; i < sizeof(content_type_str) - 1; i++)
        if (tolower((unsigned char)buffer[i]) != content_type_str[i])
            return nitems;

    // Value starts right after the ':'; skip optional spaces
    const char *header_val = buffer + sizeof(content_type_str) - 1;
    while (header_val < end && *header_val == ' ')
        header_val++;
    // Ensures that header_val..end can fit "text/html"
    if ((size_t)(end - header_val) < sizeof(html_mime_str) - 1)
        return nitems;

    userdata->flags |= HEADERCB_CONTENT_TYPE_ENCOUNTERED;
    if (memcmp(header_val, html_mime_str, sizeof(html_mime_str) - 1) != 0)
        return CURL_WRITEFUNC_ERROR;
    userdata->flags |= HEADERCB_VALID_MIME;
    return nitems;
}
// Zeroes `data` and prepares its write-callback state: allocates the initial
// page buffer of INIT_PAGE_SIZE bytes (base == begin, end one past the
// buffer) and records the URL and module list for the callbacks.
void initcbdata(const char *url, moduleentryp_dynarr_t *modules, cbdata_t *data) {
    memset(data, 0, sizeof(*data));

    writecb_data_t *wcb = &data->writecb_data;
    wcb->url = url;
    wcb->modules = modules;
    wcb->base = xmalloc(INIT_PAGE_SIZE);
    wcb->begin = wcb->base;
    wcb->end = wcb->base + INIT_PAGE_SIZE;
}
// Creates and configures an easy handle for fetching `url`.
//
// The handle is rate limited to 1 MiB/s in both directions, follows up to 3
// redirects, is restricted to http/https, sends the host's header list and
// writes the body through bufwritecb into cbdata->writecb_data. For regular
// crawl transfers (wasrequested == false) headerwritecb is installed as well.
//
// Returns the handle, or NULL after logging an error (the partially
// configured handle is cleaned up).
CURL *makehandle(const char *url, hostentry_t *host_entry, cbdata_t *cbdata, bool wasrequested) {
    CURL *curl_h = curl_easy_init();
    CURLcode easy_res;
    if (curl_h == NULL) {
        error("curl failed to initialize\n");
        return NULL;
    }
    // curl_easy_setopt is variadic: pass exactly the documented argument type
    // (curl_off_t for *_LARGE options, long for numeric options).
    if (/* 1MiB/s max send speed */
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAX_SEND_SPEED_LARGE, (curl_off_t)(1024 * 1024))) != CURLE_OK ||
        /* 1MiB/s max recv speed */
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAX_RECV_SPEED_LARGE, (curl_off_t)(1024 * 1024))) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_TIMEOUT_MS, (long)TIMEOUT_MS)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_CONNECTTIMEOUT_MS, (long)CONNECT_TIMEOUT_MS)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_FOLLOWLOCATION, 1L)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAXREDIRS, 3L)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_HTTPHEADER, host_entry->headers)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_PROTOCOLS_STR, "http,https")) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_WRITEFUNCTION, bufwritecb)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_WRITEDATA, &cbdata->writecb_data)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_URL, url)) != CURLE_OK) {
        error("curl setopt failed: %s (code %d)\n", curl_easy_strerror(easy_res), easy_res);
        curl_easy_cleanup(curl_h);
        return NULL;
    }
    if (!wasrequested &&
        ((easy_res = curl_easy_setopt(curl_h, CURLOPT_HEADERFUNCTION, headerwritecb)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_HEADERDATA, &cbdata->headercb_data)) != CURLE_OK)) {
        error("curl setopt failed: %s (code %d)\n", curl_easy_strerror(easy_res), easy_res);
        curl_easy_cleanup(curl_h);
        return NULL;
    }
    return curl_h;
}

88
src/json.c Normal file
View File

@ -0,0 +1,88 @@
#include <stdbool.h>
#include <stdio.h>
#include "json.h"
// Wraps a key/value array in a JSON object value. The array struct is copied
// into a heap allocation that json_destroy() later releases.
jsonval_t json_createobj(jsonkv_dynarr_t pairs) {
    jsonkv_dynarr_t *heapcopy = xmalloc(sizeof(*heapcopy));
    *heapcopy = pairs;

    jsonval_t obj = { .type = JSON_OBJECT, .data = heapcopy };
    return obj;
}
// Wraps an array of values in a JSON array value. The array struct is copied
// into a heap allocation that json_destroy() later releases.
jsonval_t json_createarr(jsonval_dynarr_t elems) {
    jsonval_dynarr_t *heapcopy = xmalloc(sizeof(*heapcopy));
    *heapcopy = elems;

    jsonval_t arr = { .type = JSON_ARRAY, .data = heapcopy };
    return arr;
}
jsonval_t json_createstr(const char *str) {
return (jsonval_t){ .type = JSON_STRING, .data = (void*)str };
}
jsonval_t json_createint(long num) {
return (jsonval_t){ .type = JSON_INT, .data = (void*)num };
}
jsonval_t json_createbool(bool val) {
return (jsonval_t){ .type = JSON_BOOL, .data = (void*)val };
}
jsonval_t json_createnull(void) {
return (jsonval_t){ .type = JSON_NULL };
}
// Recursively releases a JSON value. Objects and arrays destroy their members
// first, then their backing array and the heap copy made by
// json_createobj()/json_createarr(). Other value types own nothing.
void json_destroy(jsonval_t *val) {
    switch (val->type) {
    case JSON_OBJECT: {
        jsonkv_dynarr_t *members = (jsonkv_dynarr_t*)val->data;
        for (size_t idx = 0; idx < members->len; idx++)
            json_destroy(&members->data[idx].val);
        dynarr_destroy(*members);
        xfree(members);
        break;
    }
    case JSON_ARRAY: {
        jsonval_dynarr_t *elems = (jsonval_dynarr_t*)val->data;
        for (size_t idx = 0; idx < elems->len; idx++)
            json_destroy(&elems->data[idx]);
        dynarr_destroy(*elems);
        xfree(elems);
        break;
    }
    default:
        // Strings, numbers, booleans and null hold no owned memory
        break;
    }
}
// Serialises `val` to `out` as compact JSON (no whitespace). Keys and string
// values are written verbatim between double quotes, without escaping.
void json_write(FILE *out, jsonval_t *val) {
    switch(val->type) {
    case JSON_OBJECT: {
        jsonkv_dynarr_t *members = (jsonkv_dynarr_t*)val->data;
        fputc('{', out);
        for (size_t idx = 0; idx < members->len; idx++) {
            // Separator goes in front of every member except the first
            if (idx > 0)
                fputc(',', out);
            fprintf(out, "\"%s\":", members->data[idx].key);
            json_write(out, &members->data[idx].val);
        }
        fputc('}', out);
        break;
    }
    case JSON_ARRAY: {
        jsonval_dynarr_t *elems = (jsonval_dynarr_t*)val->data;
        fputc('[', out);
        for (size_t idx = 0; idx < elems->len; idx++) {
            if (idx > 0)
                fputc(',', out);
            json_write(out, &elems->data[idx]);
        }
        fputc(']', out);
        break;
    }
    case JSON_STRING:
        fprintf(out, "\"%s\"", (const char*)val->data);
        break;
    case JSON_INT:
        fprintf(out, "%ld", (long)val->data);
        break;
    case JSON_BOOL:
        fputs(val->data ? "true" : "false", out);
        break;
    case JSON_NULL:
        fputs("null", out);
        break;
    }
}

39
src/main.c Normal file
View File

@ -0,0 +1,39 @@
#include <stdio.h>
#include <stdlib.h>
#include <curl/curl.h>
#include <sys/random.h>
#include "crawler.h"
#include "module.h"
#include "util.h"
// NULL-terminated list of host names handed to the crawler through
// crawlerconfig_t.allowedhosts in main().
const char *allowed_hosts[] = {
"32bit.cafe",
//"en.wikipedia.org",
NULL
};
int main(int argc, char **argv) {
if (argc != 2) {
fprintf(stderr, "url dumbass\n");
return 1;
}
int seed, rc;
if ((rc = getrandom(&seed, sizeof(seed), 0)) != sizeof(seed)) {
fatal("getrandom() failed with %d", rc);
return 1;
}
srandom(seed);
crawlerconfig_t config = { .allowedhosts = allowed_hosts, .req_interval_s = 0 };
curl_global_init(CURL_GLOBAL_DEFAULT);
dynarr_init(moduleentryp_dynarr_t, config.enabledmodules);
for (moduleentry_t *module = availmodules; module->name != NULL; module++)
dynarr_push(config.enabledmodules, module);
crawler(argv[1], &config);
dynarr_destroy(config.enabledmodules);
curl_global_cleanup();
return 0;
}

56
src/mod_debug.c Normal file
View File

@ -0,0 +1,56 @@
#include <tidy.h>
#include <tidybuffio.h>
#include "util.h"
#include "module.h"
/* Prints the subtree below tnod to stderr, one node per line. Named nodes are
 * printed as tags with their attributes; unnamed nodes are printed as their
 * text. Each nesting level is indented by 4 more columns. */
void dumpNode(TidyDoc doc, TidyNode tnod, int indent)
{
    for (TidyNode cur = tidyGetChild(tnod); cur; cur = tidyGetNext(cur)) {
        ctmbstr tagname = tidyNodeGetName(cur);
        if (!tagname) {
            /* no name: probably text, cdata, etc... */
            TidyBuffer text;
            tidyBufInit(&text);
            tidyNodeGetText(doc, cur, &text);
            fprintf(stderr, "%*.*s%s\n", indent, indent, "", text.bp?(char *)text.bp:"");
            tidyBufFree(&text);
        }
        else {
            /* a name means it's an HTML tag */
            fprintf(stderr, "%*.*s%s ", indent, indent, "<", tagname);
            /* walk the attribute list */
            for (TidyAttr at = tidyAttrFirst(cur); at; at = tidyAttrNext(at)) {
                fprintf(stderr, "%s", tidyAttrName(at));
                ctmbstr atval = tidyAttrValue(at);
                if (atval)
                    fprintf(stderr, "=\"%s\" ", atval);
                else
                    fprintf(stderr, " ");
            }
            fprintf(stderr, ">\n");
        }
        dumpNode(doc, cur, indent + 4); /* recursive */
    }
}
// onpagecomplete hook of the debug module: dumps the raw page to stderr,
// followed by the node tree of the "tidyDoc" extra-data entry.
// Returns 0 on success, -1 when no tidy document is attached to the page.
int mod_debug_onpagecomplete(void *userdata, pagecompletedata_t *data) {
    fprintf(stderr, "\n-- HTML for %s\n\n", data->url);
    fwrite(data->page, 1, data->npage, stderr);
    fprintf(stderr, "\n\n-- *CLEANED* HTML for %s\n\n", data->url);

    TidyDoc tidied = searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len);
    if (tidied != NULL) {
        dumpNode(tidied, tidyGetRoot(tidied), 0);
        return 0;
    }
    error("\"tidyDoc\" entry not found. either mod_tidy failed or is not loaded before");
    return -1;
}
// Module init: resets the entry so that only the init pointer and the
// onpagecomplete hook are set. Always returns 0.
int mod_debug_init(crawlermodule_t *entry) {
    crawlermodule_t fresh = { 0 };
    fresh.init = entry->init;
    fresh.onpagecomplete = mod_debug_onpagecomplete;
    *entry = fresh;
    return 0;
}

149
src/mod_pagedata.c Normal file
View File

@ -0,0 +1,149 @@
#include <curl/curl.h>
#include <tidybuffio.h>
#include <tidy.h>
#include "json.h"
#include "module.h"
#include "util.h"
#define MAX_HEADER_SIZE 256
#define MAX_TITLE_SIZE 256
// Looks up response header `header` on `handle` and stores a sanitised heap
// copy of its value in *escaped.
//
// Returns true with *escaped == NULL when the header is simply absent, true
// with *escaped set on success, and false (after logging) on a libcurl error,
// an oversized value or a sanitising failure.
bool getescapedheader(CURL *handle, const char *header, char **escaped) {
    struct curl_header *data;
    *escaped = NULL;
    CURLHcode res = curl_easy_header(handle, header, 0, CURLH_HEADER, -1, &data);
    if (res != CURLHE_OK) {
        // These codes only mean "no such header"; that is not an error
        if (res == CURLHE_BADINDEX || res == CURLHE_NOREQUEST ||
            res == CURLHE_NOHEADERS || res == CURLHE_MISSING)
            return true;
        error("curl_easy_header() failed with code %d", res);
        return false;
    }
    int len = strlen(data->value);
    if (len > MAX_HEADER_SIZE) {
        error("max header size of %d bytes exceeded. header size is %d bytes", MAX_HEADER_SIZE, len);
        return false;
    }
    *escaped = sanitize2ascii_dyn(data->value, MAX_HEADER_SIZE * 4);
    // Check the result, not the caller's out-pointer
    if (*escaped == NULL) {
        error("sanitize2ascii_dyn() failed");
        return false;
    }
    return true;
}
// Searches the subtree below `node` depth-first for a <title> element and
// returns a heap copy of its text with the final character (the newline tidy
// appends) removed. Returns NULL when no usable title exists.
// The caller frees the result with xfree().
char *gettitle(TidyDoc doc, TidyNode node) {
    for (TidyNode child = tidyGetChild(node); child; child = tidyGetNext(child)) {
        ctmbstr name = tidyNodeGetName(child);
        // Unnamed nodes (text, comments, ...) cannot be or contain the title;
        // skip them but keep looking at the remaining siblings.
        if (!name)
            continue;
        if (strcmp(name, "title") == 0) {
            TidyNode textchild = tidyGetChild(child); // If conforming, should be text
            if (!textchild)
                // Empty <title></title>
                continue;
            TidyBuffer buf;
            tidyBufInit(&buf);
            if (!tidyNodeGetText(doc, textchild, &buf) || buf.bp == NULL) {
                tidyBufFree(&buf);
                continue;
            }
            size_t len = strlen((char*)buf.bp);
            // tidy places a newline at the end of a title, so we have to be careful to get rid
            // of it
            if (len <= 1) {
                tidyBufFree(&buf);
                continue;
            }
            char *ret = xmalloc(len+1);
            strcpy(ret, (char*)buf.bp);
            ret[len-1] = '\0';
            tidyBufFree(&buf);
            return ret;
        }
        char *ret = gettitle(doc, child);
        if (ret != NULL)
            return ret;
    }
    return NULL;
}
// Extracts page metadata from the tidied document into `json`. Currently this
// is only the title from <head>, stored under the key "title".
// Strings referenced by the JSON are also pushed onto `freearr`, so that the
// owner of that list can free them later.
void dumpnode(TidyDoc doc, charp_dynarr_t *freearr, jsonkv_dynarr_t *json) {
    TidyNode head = tidyGetHead(doc);
    if (!head)
        return;
    char *title = gettitle(doc, head);
    if (title == NULL)
        return;

    int len = strlen(title);
    if (len > MAX_TITLE_SIZE) {
        error("max title size of %d bytes exceeded. title size is %d bytes", MAX_TITLE_SIZE, len);
    }
    else {
        char *escaped = sanitize2ascii_dyn(title, MAX_TITLE_SIZE * 4);
        if (escaped == NULL) {
            // Never store a NULL string in the JSON; it would be printed with %s
            error("sanitize2ascii_dyn() failed");
        }
        else {
            jsonkv_t kv = { .key = "title", .val = json_createstr(escaped) };
            dynarr_push(*json, kv);
            dynarr_push(*freearr, escaped);
        }
    }
    xfree(title);
}
// onpagecomplete hook of the pagedata module. Adds selected response headers
// and, when a tidied document is available, the page title to the page's
// "json" extra-data entry (creating that entry if it does not exist yet).
// `userdata` is the module's list of strings to free on destroy.
// Always returns 0.
int mod_pagedata_onpagecomplete(void *userdata, pagecompletedata_t *data) {
    jsonkv_dynarr_t *pagejson =
        searchextradata(EXTRA_JSON, "json", data->extradata.data, data->extradata.len);
    if (pagejson == NULL) {
        // First module to need it: create the array and register it
        jsonkv_dynarr_t *fresh = xmalloc(sizeof(jsonkv_dynarr_t));
        dynarr_init(jsonkv_dynarr_t, *fresh);
        extradata_t slot = {
            .type = EXTRA_JSON,
            .key = "json",
            .val = fresh,
        };
        dynarr_push(data->extradata, slot);
        pagejson = (jsonkv_dynarr_t*)dynarr_get(data->extradata, data->extradata.len-1)->val;
    }

    charp_dynarr_t *owned = userdata;
    char *headernames[] = {"Content-Type", "Last-Modified", "ETag"};
    for (size_t hdr = 0; hdr < array_size(headernames); hdr++) {
        char *value;
        if (!getescapedheader(data->handle, headernames[hdr], &value)) {
            error("appendheader() failed with header %s", headernames[hdr]);
            continue;
        }
        // NULL means the response did not carry this header
        if (value != NULL) {
            dynarr_push(*owned, value);
            jsonkv_t kv = { .key = headernames[hdr], .val = json_createstr(value) };
            dynarr_push(*pagejson, kv);
        }
    }

    TidyDoc tidied = searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len);
    if (tidied != NULL)
        dumpnode(tidied, owned, pagejson);
    else
        debug("url %s contained no tidy data, skipping document parsing...", data->url);
    return 0;
}
// Module destroy hook: frees every string collected during the crawl, then
// the list itself. Always returns 0.
int mod_pagedata_destroy(void *userdata) {
    charp_dynarr_t *owned = userdata;
    size_t idx = 0;
    while (idx < owned->len) {
        xfree(owned->data[idx]);
        idx++;
    }
    dynarr_destroy(*owned);
    xfree(owned);
    return 0;
}
// Module init: installs the pagedata hooks and allocates the module's
// userdata, an initially empty list of strings to free on destroy.
// Always returns 0.
int mod_pagedata_init(crawlermodule_t *entry) {
    crawlermodule_t fresh = { 0 };
    fresh.userdata = xmalloc(sizeof(charp_dynarr_t));
    fresh.init = entry->init;
    fresh.destroy = mod_pagedata_destroy;
    fresh.onpagecomplete = mod_pagedata_onpagecomplete;
    *entry = fresh;

    charp_dynarr_t *owned = entry->userdata;
    *owned = dynarr_initi(charp_dynarr_t);
    return 0;
}

214
src/mod_parse.c Normal file
View File

@ -0,0 +1,214 @@
#include <curl/curl.h>
#include <ctype.h>
#include <tidybuffio.h>
#include <tidy.h>
#include "module.h"
#include "util.h"
#define MAX_LINK_LEN 512
// Returns true when `c` is allowed inside a link: an ASCII letter or digit,
// or one of  & $ , / : ; = ? @ # % ~ _ - ( ) .
bool islinksafe(char c) {
    if (c >= 'a' && c <= 'z')
        return true;
    if (c >= 'A' && c <= 'Z')
        return true;
    if (c >= '0' && c <= '9')
        return true;
    switch (c) {
    case '&': case '$': case ',': case '/':
    case ':': case ';': case '=': case '?':
    case '@': case '#': case '%': case '~':
    case '_': case '-': case '(': case ')':
    case '.':
        return true;
    default:
        return false;
    }
}
// Resolves `relative` against `parent` using the shared URL handle and
// returns the absolute URL (host in punycode) as an xmalloc'd string.
// Returns NULL after logging when any step fails. The caller frees the result.
char *relative2absolute(CURLU *curl_url_h, const char *parent, const char *relative) {
    CURLUcode url_res;
    char *curl_abs_link = NULL;

    // Load the parent first; setting a relative URL afterwards resolves against it
    url_res = curl_url_set(curl_url_h, CURLUPART_URL, parent, 0);
    if (url_res != CURLUE_OK) {
        char sanitized[100];
        // Report the URL that actually failed to parse
        sanitize2ascii(sanitized, parent, sizeof(sanitized));
        error("Parent URL parsing failed for \"%s\": %s",
            sanitized, curl_url_strerror(url_res));
        return NULL;
    }
    url_res = curl_url_set(curl_url_h, CURLUPART_URL, relative, 0);
    if (url_res != CURLUE_OK){
        char sanitized[100];
        sanitize2ascii(sanitized, relative, sizeof(sanitized));
        error("Relative URL parsing failed for \"%s\": %s",
            sanitized, curl_url_strerror(url_res));
        return NULL;
    }
    url_res = curl_url_get(curl_url_h, CURLUPART_URL, &curl_abs_link, CURLU_PUNYCODE);
    if (url_res != CURLUE_OK) {
        error("Full URL parsing failed: %s", curl_url_strerror(url_res));
        return NULL;
    }

    // Hand out memory from our own allocator instead of libcurl's
    char *ret = xmalloc(strlen(curl_abs_link) + 1);
    strcpy(ret, curl_abs_link);
    curl_free(curl_abs_link);
    return ret;
}
// Returns the length of the leading part of `url` (at most `len` bytes) that
// precedes the first '#' or '?', i.e. the URL without fragment and query.
// Returns `len` when neither character occurs.
size_t geturlcutlen(const char *url, size_t len) {
    size_t cut = 0;
    while (cut < len && url[cut] != '#' && url[cut] != '?')
        cut++;
    return cut;
}
// Turns a raw link (`child`, `nchild` bytes, not necessarily NUL-terminated)
// found on page `parent` into an absolute URL. Query and fragment are cut off
// first. Returns a heap string owned by the caller, or NULL when nothing is
// left after cutting or the URL cannot be resolved.
char *parselink(CURLU *curl_url_h, const char *parent, const char *child, size_t nchild) {
    size_t keep = geturlcutlen(child, nchild);
    if (keep < 1)
        return NULL;

    // NUL-terminated copy of the part we keep
    char *trimmed = xmalloc(keep + 1);
    memcpy(trimmed, child, keep);
    trimmed[keep] = '\0';

    char *resolved = relative2absolute(curl_url_h, parent, trimmed);
    xfree(trimmed);
    if (resolved == NULL)
        return NULL;

    char sanitized[100];
    sanitize2ascii(sanitized, resolved, sizeof(sanitized));
    debug("found link: %s", sanitized);
    return resolved;
}
// Fallback link extractor for pages without a tidied document. Scans the raw
// page for  href = "..."  (attribute name case-insensitive, optional
// whitespace around '='), resolves every quoted value with parselink() and
// appends the results to `ret`.
//
// A value is dropped when it is empty, contains a character rejected by
// islinksafe(), is longer than MAX_LINK_LEN, or is not closed by '"'.
//
// Returns the number of links appended.
int parsehrefs(CURLU *curl_url_h, const char *url, const char *page, size_t npage, charp_dynarr_t *ret) {
    static const char href[] = "href";
    const size_t href_len = sizeof(href) - 1;
    int linkcnt = 0;

    size_t i = 0;
    while (i + href_len <= npage) {
        // Case-insensitive match of the attribute name at position i
        bool matched = true;
        for (size_t k = 0; matched && k < href_len; k++)
            matched = tolower((unsigned char)page[i + k]) == href[k];
        if (!matched) {
            i++;
            continue;
        }
        size_t pos = i + href_len;

        // Optional whitespace, then '='
        while (pos < npage && (page[pos] == ' ' || page[pos] == '\t' ||
                               page[pos] == '\n' || page[pos] == '\r'))
            pos++;
        if (pos >= npage || page[pos] != '=') {
            i = pos;
            continue;
        }
        pos++;

        // Optional whitespace, then the opening quote
        while (pos < npage && (page[pos] == ' ' || page[pos] == '\t' ||
                               page[pos] == '\n' || page[pos] == '\r'))
            pos++;
        if (pos >= npage || page[pos] != '"') {
            i = pos;
            continue;
        }
        pos++;

        // Consume the value up to the closing quote
        size_t start = pos;
        while (pos < npage && page[pos] != '"' &&
               islinksafe(page[pos]) && pos - start < MAX_LINK_LEN)
            pos++;
        if (pos < npage && page[pos] == '"') {
            size_t linklen = pos - start;
            if (linklen > 0) {
                char *link = parselink(curl_url_h, url, page + start, linklen);
                if (link != NULL) {
                    dynarr_push(*ret, link);
                    linkcnt++;
                }
            }
            pos++;
        }
        // Resume scanning where the value ended or was abandoned
        i = pos;
    }
    return linkcnt;
}
// Returns a newly allocated lower-case copy of the NUL-terminated string
// `in`. The caller frees the result with xfree().
char *tolower_s(const char *in) {
    size_t len = strlen(in);
    char *ret = xmalloc(len+1);
    for (size_t i = 0; i < len; i++)
        // tolower() requires a value representable as unsigned char
        ret[i] = (char)tolower((unsigned char)in[i]);
    ret[len] = '\0';
    return ret;
}
// Collects links from the href attributes of `node`, but only when the node
// is one of the tags p, t, span or a (compared case-insensitively). Resolved
// links are appended to `links`.
// Returns the number of links appended.
int parseattrs(CURLU *curl_url_h, const char *url, TidyNode node, charp_dynarr_t *links) {
    const char *texttags[] = { "p", "t", "span", "a" };

    // Is this one of the tags we look at?
    char *taglower = tolower_s(tidyNodeGetName(node));
    bool wanted = false;
    for (size_t tagind = 0; tagind < array_size(texttags); tagind++) {
        if (strcmp(taglower, texttags[tagind]) == 0) {
            wanted = true;
            break;
        }
    }
    xfree(taglower);
    if (!wanted)
        return 0;

    // Parse attributes for links
    int added = 0;
    TidyAttr attr = tidyAttrFirst(node);
    for (; attr; attr = tidyAttrNext(attr)) {
        ctmbstr attrname = tidyAttrName(attr);
        if (attrname == NULL) {
            char sanitized[100];
            sanitize2ascii(sanitized, url, sizeof(sanitized));
            error("empty attr name for url \"%s\"", sanitized);
            continue;
        }
        char *attrlower = tolower_s(attrname);
        ctmbstr val = tidyAttrValue(attr);
        bool ishref = strcmp(attrlower, "href") == 0;
        xfree(attrlower);
        if (!ishref || val == NULL)
            continue;

        // extract the link
        char *link = parselink(curl_url_h, url, val, strlen(val));
        if (link != NULL) {
            dynarr_push(*links, link);
            added++;
        }
    }
    return added;
}
// Walks the subtree below `node` depth-first and collects links from every
// named descendant via parseattrs().
// Returns the total number of links appended to `links`.
int parsenode(CURLU *curl_url_h, const char *url, TidyNode node, charp_dynarr_t *links) {
    int total = 0;
    TidyNode cur = tidyGetChild(node);
    while (cur) {
        // Unnamed nodes are probably text, for now nothing is done with them
        // TODO: Parse plain text links
        if (tidyNodeGetName(cur) != NULL)
            total += parseattrs(curl_url_h, url, cur, links);
        total += parsenode(curl_url_h, url, cur, links);
        cur = tidyGetNext(cur);
    }
    return total;
}
// onpagecomplete hook of the parse module: fills data->parsedlinks with the
// links found on the page. Uses the tidied document's body when a "tidyDoc"
// entry exists, otherwise falls back to scanning the raw page for hrefs.
// `userdata` is the module's URL handle. Always returns 0.
int mod_parse_onpagecomplete(void *userdata, pagecompletedata_t *data) {
    CURLU *urlh = userdata;
    TidyDoc tidied = searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len);
    if (tidied != NULL) {
        TidyNode body = tidyGetBody(tidied);
        if (body)
            parsenode(urlh, data->url, body, data->parsedlinks);
        return 0;
    }
    debug("No tidied document found. Parsing hrefs...");
    parsehrefs(urlh, data->url, data->page, data->npage, data->parsedlinks);
    return 0;
}
int mod_parse_destroy(void *userdata) {
CURLU *curl_url_h = userdata;
curl_url_cleanup(curl_url_h);
return 0;
}
/*
 * Fills in the module entry for the parse module: keeps the entry's own init
 * pointer, installs the page-complete and destroy hooks, and allocates a
 * fresh CURLU handle as the module's userdata. Always returns 0.
 */
int mod_parse_init(crawlermodule_t *entry) {
    crawlermodule_t module = {
        .init = entry->init,
        .onpagecomplete = mod_parse_onpagecomplete,
        .destroy = mod_parse_destroy,
        .userdata = curl_url(),
    };
    *entry = module;
    return 0;
}

235
src/mod_robots.c Normal file
View File

@ -0,0 +1,235 @@
#include <string.h>
#include <stdlib.h>
#include <curl/curl.h>
#include <time.h>
#include <errno.h>
#include "module.h"
#include "util.h"
// Per-host robots.txt state, kept as a singly linked list (see `next`).
typedef struct hostdata {
// Host name this entry applies to (heap copy made in mod_robots_filter).
char *host;
// Minimum time between two crawls of this host; compared against a
// millisecond difference in mod_robots_filter.
time_t crawldelay_ms;
// Path prefixes that must not be crawled. `prefixes.data == NULL` is used
// by mod_robots_filter to mean "robots.txt has not been processed yet".
charp_dynarr_t prefixes;
// Next host in the list, or NULL at the tail.
struct hostdata *next;
// CLOCK_MONOTONIC timestamp of the last request the filter let through.
struct timespec lastcrawled;
} hostdata_t;
// Module state. Also reused as the robots.txt request callback data, in which
// case `rules` points at the single host entry being filled in.
typedef struct {
// Head of the per-host rule list (or the entry under construction).
hostdata_t *rules;
// Shared libcurl URL handle used for all URL parsing in this module.
CURLU *curl_url_h;
} state_t;
/*
 * Returns true when `str` begins with `prefix` (byte-wise comparison).
 * An empty prefix matches every string.
 */
bool isprefixed(const char *str, const char *prefix) {
    size_t plen = strlen(prefix);
    if (strlen(str) < plen)
        return false;
    return memcmp(str, prefix, plen) == 0;
}
/*
 * Parses the text of a robots.txt file into `rules`.
 *
 * `page` must be NUL-terminated and is modified in place (tabs become spaces,
 * lines and tokens are split with strtok_r). Recognised directives:
 *   User-agent:  selects whether the following rules apply to us ("*" or a
 *                name starting with "AvaBot").
 *   Disallow:    appends a forbidden path prefix to rules->prefixes.
 *   Crawl-delay: sets rules->crawldelay_ms from a delay given in seconds.
 * Directives without an argument and unknown directives are ignored.
 */
void parse_robots_txt(hostdata_t *rules, char *page) {
    // Treat tabs as spaces so a single delimiter covers all inline whitespace.
    for (char *ptr = page; *ptr != '\0'; ptr++)
        if (*ptr == '\t')
            *ptr = ' ';
    char *linesave, *line;
    bool forus = true;
    // page is guaranteed to be terminated by a null byte.
    // '\r' is a delimiter too, so CRLF files do not leave a stray CR in the argument.
    while ((line = strtok_r(page, "\r\n", &linesave)) != NULL) {
        page = NULL;
        // Strip trailing comments.
        char *comment = strchr(line, '#');
        if (comment != NULL)
            *comment = '\0';
        // Both tokens point into `line`, which stays valid for this iteration,
        // so no copy is needed (the old heap copy leaked when arg was missing).
        char *wssave;
        char *func = strtok_r(line, " ", &wssave);
        if (func == NULL)
            continue;
        char *arg = strtok_r(NULL, " ", &wssave);
        if (arg == NULL)
            continue;
        if (strcmp(func, "User-agent:") == 0) {
            forus = strcmp(arg, "*") == 0 || isprefixed(arg, "AvaBot");
            continue;
        }
        // Everything below only matters inside a group addressed to us.
        if (!forus)
            continue;
        if (strcmp(func, "Disallow:") == 0) {
            if (strcmp(arg, "*") == 0) {
                dynarr_push(rules->prefixes, "/");
            }
            else {
                char *buf = xmalloc(strlen(arg)+1);
                strcpy(buf, arg);
                dynarr_push(rules->prefixes, buf);
            }
        }
        else if (strcmp(func, "Crawl-delay:") == 0) {
            // The directive is in (possibly fractional) seconds; store milliseconds.
            // Negative, NaN and absurdly large values are ignored so the
            // conversion to time_t stays well defined.
            double delay_s = atof(arg);
            if (delay_s >= 0.0 && delay_s <= 1.0e6)
                rules->crawldelay_ms = (time_t)(delay_s * 1000.0);
        }
    }
}
/*
 * Completion callback for a robots.txt request.
 *
 * `cbdata` is a heap-allocated state_t owned by this callback; its `rules`
 * member is the host entry to fill in. If the response is text/plain it is
 * parsed into the entry. Otherwise, when the request was for "/robots.txt",
 * one more request is issued for "/.well-known/robots.txt" and ownership of
 * `cbdata` moves to that request. On every other path `cbdata` is freed.
 */
void robots_txt_cb(void *cbdata, const char *url, char *page, size_t npage, CURL *handle) {
    state_t *state = cbdata;
    hostdata_t *rules = state->rules;
    char *curl_path = NULL;
    char *ctype = NULL;
    bool handed_off = false; // true once cbdata belongs to a follow-up request
    CURLUcode rc;
    CURLcode ec;
    (void)npage;
    // The callback can run twice for one host (fallback request), so only
    // initialise the prefix list the first time.
    if (rules->prefixes.data == NULL)
        dynarr_init(charp_dynarr_t, rules->prefixes);
    rules->crawldelay_ms = 100;
    if ((rc = curl_url_set(state->curl_url_h, CURLUPART_URL, url, 0)) != CURLUE_OK ||
        (rc = curl_url_get(state->curl_url_h, CURLUPART_PATH, &curl_path, 0)) != CURLUE_OK) {
        error("curl_url operation failed: %s (code %d)", curl_url_strerror(rc), rc);
        goto done;
    }
    if ((ec = curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype)) != CURLE_OK) {
        error("curl_easy_getinfo failed: %s (code %d)", curl_easy_strerror(ec), ec);
        goto done;
    }
    if (ctype != NULL && isprefixed(ctype, "text/plain") && page != NULL) {
        parse_robots_txt(rules, page);
        goto done;
    }
    // No usable robots.txt. libcurl reports paths with a leading '/', so skip
    // it before comparing; only the primary location triggers the fallback.
    const char *relpath = curl_path[0] == '/' ? curl_path + 1 : curl_path;
    if (strcmp(relpath, "robots.txt") == 0) {
        char *curl_newurl;
        const char *newpath = ".well-known/robots.txt";
        if ((rc = curl_url_set(state->curl_url_h, CURLUPART_PATH, newpath, 0)) != CURLUE_OK ||
            (rc = curl_url_get(state->curl_url_h, CURLUPART_URL, &curl_newurl, 0)) != CURLUE_OK) {
            error("curl_url operation failed: %s (code %d)", curl_url_strerror(rc), rc);
            goto done;
        }
        char *newurl = xmalloc(strlen(curl_newurl)+1);
        strcpy(newurl, curl_newurl);
        curl_free(curl_newurl);
        makerequest(newurl, robots_txt_cb, cbdata);
        handed_off = true;
    }
done:
    curl_free(curl_path); // curl_free(NULL) is a no-op
    if (!handed_off)
        xfree(state); // state is cb owned
}
filterres_t mod_robots_filter(void *userdata, const char *url) {
state_t *state = userdata;
char *curl_host, *curl_path;
CURLUcode rc;
if ((rc = curl_url_set(state->curl_url_h, CURLUPART_URL, url, 0) != CURLUE_OK) ||
(rc = curl_url_get(state->curl_url_h, CURLUPART_HOST, &curl_host, 0) != CURLUE_OK) ||
(rc = curl_url_get(state->curl_url_h, CURLUPART_PATH, &curl_path, 0) != CURLUE_OK)) {
char sanitized[100];
sanitize2ascii(sanitized, url, sizeof(sanitized));
error("curl_url operation failed for \"%s\": %s (code %d)", sanitized,
curl_url_strerror(rc), rc);
return FILTER_PASS;
}
if (curl_host == NULL) {
error("curl_host == NULL");
return FILTER_PASS;
}
hostdata_t *rules = state->rules, *prev = NULL;
for (; rules != NULL &&
strcmp(rules->host, curl_host) != 0;
prev = rules, rules = rules->next)
;
filterres_t ret;
if (rules == NULL) {
char *curl_url;
if ((rc = curl_url_set(state->curl_url_h, CURLUPART_PATH, "robots.txt", 0) != CURLUE_OK) ||
(rc = curl_url_get(state->curl_url_h, CURLUPART_URL, &curl_url, 0) != CURLUE_OK)) {
char sanitized[100];
sanitize2ascii(sanitized, url, sizeof(sanitized));
error("curl_url operation failed for \"%s\": %s (code %d)", sanitized,
curl_url_strerror(rc), rc);
ret = FILTER_PASS;
goto cleanup;
}
if (curl_url == NULL) {
error("curl_url == NULL");
ret = FILTER_PASS;
goto cleanup;
}
char *host = xmalloc(strlen(curl_host)+1);
strcpy(host, curl_host);
hostdata_t *newrules = xmalloc(sizeof(hostdata_t));
*newrules = (hostdata_t) { .host = host, 0 };
if (prev == NULL)
state->rules = newrules;
else
prev->next = newrules;
char *url = xmalloc(strlen(curl_url)+1);
strcpy(url, curl_url);
state_t *cbdata = xmalloc(sizeof(state_t));
cbdata->rules = newrules;
cbdata->curl_url_h = state->curl_url_h;
debug("cbdata = %p", cbdata);
makerequest(url, robots_txt_cb, cbdata);
curl_free(curl_url);
ret = FILTER_STALL;
}
else {
if (rules->prefixes.data == NULL) {
// robots.txt request hasn't finished
ret = FILTER_STALL;
goto cleanup;
}
struct timespec now;
if (clock_gettime(CLOCK_MONOTONIC, &now) < 0) {
char *err = strerror(errno);
error("clock_gettime failed: %s (code %d)", err, errno);
ret = FILTER_STALL;
goto cleanup;
}
time_t diff_ms = now.tv_sec * 1000 + now.tv_nsec / 1000000 -
rules->lastcrawled.tv_sec * 1000 - rules->lastcrawled.tv_nsec / 1000000;
if (diff_ms < rules->crawldelay_ms) {
ret = FILTER_STALL;
goto cleanup;
}
rules->lastcrawled = now;
ret = FILTER_PASS;
for (size_t i = 0; i < rules->prefixes.len; i++) {
if (isprefixed(curl_path, rules->prefixes.data[i])) {
ret = FILTER_REJECT;
break;
}
}
}
cleanup:
curl_free(curl_path);
curl_free(curl_host);
return ret;
}
int mod_robots_destroy(void *userdata) {
state_t *state = userdata;
curl_url_cleanup(state->curl_url_h);
hostdata_t *cur = state->rules, *next;
for (; cur != NULL; cur = next) {
next = cur->next;
dynarr_destroy(cur->prefixes);
xfree(cur);
}
xfree(state);
return 0;
}
/*
 * Fills in the module entry for the robots module: allocates the module
 * state (empty rule list plus a fresh CURLU handle), keeps the entry's own
 * init pointer and installs the filter and destroy hooks. Always returns 0.
 */
int mod_robots_init(crawlermodule_t *entry) {
    state_t *state = xmalloc(sizeof(state_t));
    state->rules = NULL;
    state->curl_url_h = curl_url();

    crawlermodule_t module = {
        .userdata = state,
        .init = entry->init,
        .filter = mod_robots_filter,
        .destroy = mod_robots_destroy,
    };
    *entry = module;
    return 0;
}

71
src/mod_tidy.c Normal file
View File

@ -0,0 +1,71 @@
#include <tidy.h>
#include <tidybuffio.h>
#include "util.h"
#include "module.h"
/*
 * Runs the tidy pipeline on `doc`: parse `docbuf`, clean and repair, then
 * run diagnostics. Stops at the first step that returns a negative code,
 * logs it, and returns it; otherwise returns the diagnostics result.
 */
int repairdoc(TidyDoc doc, TidyBuffer *docbuf) {
    int rc = tidyParseBuffer(doc, docbuf);
    if (rc < 0) {
        error("tidyParseBuffer() returned code %d", rc);
        return rc;
    }
    if ((rc = tidyCleanAndRepair(doc)) < 0) {
        error("tidyCleanAndRepair() returned code %d", rc);
        return rc;
    }
    if ((rc = tidyRunDiagnostics(doc)) < 0)
        error("tidyRunDiagnostics() returned code %d", rc);
    return rc;
}
/*
 * Page-complete hook of the tidy module.
 *
 * Parses and repairs the downloaded page with libtidy and, on success,
 * attaches the resulting TidyDoc to the page's extradata under the key
 * "tidyDoc" (released later by mod_tidy_onpagedestroy).
 * Returns -1 when a tidy document is already attached or the tidy document
 * could not be created, the negative tidy code when repair fails, and 0 on
 * success.
 */
int mod_tidy_onpagecomplete(void *userdata, pagecompletedata_t *data) {
    (void)userdata;
    if (searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len) != NULL)
        return -1;
    TidyBuffer origbuf = {0}, errbuf = {0};
    TidyDoc tdoc = tidyCreate();
    if (tdoc == NULL) {
        error("tidyCreate() failed");
        return -1;
    }
    // Borrow the page bytes; they are detached again before returning.
    tidyBufAttach(&origbuf, (byte*)data->page, data->npage);
    tidyOptSetBool(tdoc, TidyForceOutput, yes);
    tidyOptSetInt(tdoc, TidyWrapLen, 4096);
    tidySetErrorBuffer(tdoc, &errbuf);
    int rc = repairdoc(tdoc, &origbuf);
    if (rc < 0) {
        // The document is never published, so nobody else would release it.
        tidyRelease(tdoc);
    }
    else {
        // The error buffer stays unallocated when tidy reported nothing.
        if (errbuf.bp != NULL)
            debug("encountered errors in doc: %s", (const char *)errbuf.bp);
        extradata_t entry = {
            .type = EXTRA_TIDY,
            .key = "tidyDoc",
            .val = (void*)tdoc,
        };
        dynarr_push(data->extradata, entry);
    }
    tidyBufFree(&errbuf);
    // TODO: Safety is unknown
    // tdoc still exists while corresponding tidyBuffer is detached
    tidyBufDetach(&origbuf);
    return rc >= 0 ? 0 : rc;
}
/*
 * Page-destroy hook of the tidy module: releases the TidyDoc stored under
 * the key "tidyDoc" in the page's extradata, if there is one.
 * Always returns 0.
 */
int mod_tidy_onpagedestroy(void *userdata, pagecompletedata_t *data) {
    TidyDoc attached = searchextradata(EXTRA_TIDY, "tidyDoc", data->extradata.data, data->extradata.len);
    if (attached == NULL)
        return 0;
    tidyRelease(attached);
    return 0;
}
/*
 * Fills in the module entry for the tidy module: keeps the entry's own init
 * pointer and installs the page-complete and page-destroy hooks; all other
 * members are zeroed. Always returns 0.
 */
int mod_tidy_init(crawlermodule_t *entry) {
    crawlermodule_t module = {
        .init = entry->init,
        .onpagecomplete = mod_tidy_onpagecomplete,
        .onpagedestroy = mod_tidy_onpagedestroy,
    };
    *entry = module;
    return 0;
}

28
src/module.c Normal file
View File

@ -0,0 +1,28 @@
#include <string.h>
#include "module.h"
// Table of the modules compiled into the crawler: each entry pairs a module
// name with its init function. The final zeroed entry presumably acts as the
// end-of-table sentinel -- confirm against the code that walks this array.
moduleentry_t availmodules[] = {
(moduleentry_t) { .name = "mod_tidy", .module = { .init = mod_tidy_init } },
//(moduleentry_t) { .name = "mod_debug", .module = { .init = mod_debug_init } },
(moduleentry_t) { .name = "mod_pagedata", .module = { .init = mod_pagedata_init } },
(moduleentry_t) { .name = "mod_parse", .module = { .init = mod_parse_init } },
(moduleentry_t) { .name = "mod_robots", .module = { .init = mod_robots_init } },
{ 0 },
};
// Requests queued by modules through makerequest(); lazily initialised there.
requestedreq_dyanrr_t requestedreqs = { 0 };
/*
 * Linear search through `ndata` extradata entries for the first one whose
 * type equals `type` and whose key string equals `key`.
 * Returns that entry's value, or NULL when nothing matches.
 */
void *searchextradata(extradata_type_t type, char *key, extradata_t *data, size_t ndata) {
    size_t idx = 0;
    while (idx < ndata) {
        extradata_t *candidate = &data[idx];
        if (candidate->type == type && strcmp(candidate->key, key) == 0)
            return candidate->val;
        idx++;
    }
    return NULL;
}
/*
 * Queues a request for `url` in the global `requestedreqs` array, creating
 * the array on first use. `cb` and `userdata` are stored with the request.
 * The URL pointer is stored as-is, not copied.
 */
void makerequest(const char *url, reqcb_t cb, void *userdata) {
    requestedreq_t queued = { .url = url, .cb = cb, .userdata = userdata, 0 };
    if (!requestedreqs.data)
        dynarr_init(requestedreq_dyanrr_t, requestedreqs);
    dynarr_push(requestedreqs, queued);
}

228
src/util.c Normal file
View File

@ -0,0 +1,228 @@
#include <stdlib.h>
#include <stdarg.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <stdbool.h>
#include <pthread.h>
#include <unistd.h>
#include <stdint.h>
#include "util.h"
/*
 * Returns a pointer into `path` at the start of its last two '/'-separated
 * components (e.g. "src/mod/util.c" -> "mod/util.c"). Paths with fewer than
 * two components are returned unchanged. No memory is allocated.
 */
const char *last2path(const char *path) {
    // prev is the start of the component being looked at, prev2 the one before it.
    const char *prev2 = NULL, *prev = NULL, *cur = path;
    while (cur != NULL) {
        prev2 = prev;
        prev = cur;
        // Step past the next separator; the NULL result is never incremented.
        const char *slash = strchr(cur, '/');
        cur = slash != NULL ? slash + 1 : NULL;
    }
    return prev2 != NULL ? prev2 : path;
}
/*
 * Writes one log line to stderr: a colour escape and level tag, the source
 * location (last two path components of `file` plus `line`, or
 * "(unknown file)" when `file` is NULL), the printf-style message built from
 * `fmt`/`ap`, and a colour reset with newline.
 */
void volog(loglevel_t level, const char *file, int line, const char *fmt, va_list ap) {
    const char *cc;
    switch (level) {
    case LEVEL_DEBUG:
        cc = "\x1b[0mDEBUG";
        break;
    case LEVEL_INFO:
        cc = "\x1b[0mINFO";
        break;
    case LEVEL_WARN:
        cc = "\x1b[1;35mWARN";
        break;
    case LEVEL_ERROR:
        cc = "\x1b[1;31mERROR";
        break;
    case LEVEL_FATAL:
        cc = "\x1b[38;5;124mFATAL";
        break;
    default:
        // Unknown level: never print through an uninitialised pointer.
        cc = "\x1b[0mLOG";
        break;
    }
    if (file == NULL)
        fprintf(stderr, "%s (unknown file): ", cc);
    else
        fprintf(stderr, "%s (%s:%d): ", cc, last2path(file), line);
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\x1b[0m\n");
}
/*
 * Variadic front end for volog(): packs the trailing arguments into a
 * va_list and logs them at `level` for the given source location.
 */
void olog(loglevel_t level, const char *file, int line, const char *fmt, ...) {
    va_list args;

    va_start(args, fmt);
    volog(level, file, line, fmt, args);
    va_end(args);
}
/*
 * Logs a printf-style message at LEVEL_FATAL (with no source location) and
 * terminates the process with exit status 1.
 */
void die(const char *fmt, ...) {
    va_list args;

    va_start(args, fmt);
    volog(LEVEL_FATAL, NULL, 0, fmt, args);
    va_end(args);
    exit(1);
}
/*
 * malloc() wrapper that never returns NULL: when the allocation fails the
 * process is terminated through die().
 */
void *xmalloc(size_t size) {
    void *mem = malloc(size);
    if (!mem)
        die("xmalloc failed: %s", strerror(errno));
    return mem;
}
/*
 * calloc() wrapper that never returns NULL: when the allocation fails the
 * process is terminated through die().
 */
void *xcalloc(size_t nmemb, size_t size) {
    void *mem = calloc(nmemb, size);
    if (!mem)
        die("xcalloc failed: %s", strerror(errno));
    return mem;
}
/*
 * realloc() wrapper: resizes `ptr` to `size` bytes and returns the new
 * block. When realloc() returns NULL the process is terminated through die().
 */
void *xrealloc(void *ptr, size_t size) {
    void *resized = realloc(ptr, size);
    if (resized == NULL)
        die("xrealloc failed: %s", strerror(errno));
    return resized;
}
// Counterpart of xmalloc()/xcalloc()/xrealloc(): hands `ptr` straight to free().
void xfree(void *ptr) {
free(ptr);
}
/*
 * Deliberately weak hash: the sum of the `ndata` bytes at `data`, each read
 * as an unsigned char. Permutations of the same bytes therefore collide.
 */
size_t parityhash(const void *data, size_t ndata) {
    const unsigned char *bytes = data;
    size_t total = 0;
    while (ndata-- > 0)
        total += *bytes++;
    return total;
}
/*
 * Returns a newly allocated, NUL-terminated copy of `inp` that contains only
 * printable ASCII: bytes outside 0x20..0x7e become "\xNN", and backslash and
 * double quote are preceded by a backslash.
 *
 * Returns NULL when `inp` is longer than `maxlen` or when the escaped text
 * would not fit the `maxlen` budget. The caller frees the result with xfree().
 */
char *sanitize2ascii_dyn(const char *inp, size_t maxlen) {
    size_t nout = strlen(inp), out_ind = 0;
    if (nout > maxlen)
        return NULL;
    // Reserve room for the terminator so even an empty input gets a usable buffer.
    nout += 1;
    char *out = xmalloc(nout);
    char tmp[5];
    size_t inp_ind;
    for (inp_ind = 0; inp[inp_ind] != '\0'; inp_ind++) {
        // Work on the unsigned value: a negative char would be sign-extended
        // by "%02x" and the escape would come out truncated as "\xff".
        unsigned char ch = (unsigned char)inp[inp_ind];
        size_t need; // output bytes produced for this input byte
        if (ch < 0x20 || ch > 0x7e)
            need = 4; // needs to accommodate "\xXX"
        else if (ch == '\\' || ch == '"')
            need = 2;
        else
            need = 1;
        if (out_ind + need + 1 >= maxlen)
            break;
        // Grow until this chunk and the terminator fit; one doubling is not
        // always enough for short inputs.
        while (out_ind + need + 1 >= nout) {
            nout *= 2;
            out = xrealloc(out, nout);
        }
        if (need == 4) {
            snprintf(tmp, sizeof(tmp), "\\x%02x", (unsigned)ch);
            memcpy(out + out_ind, tmp, 4);
            out_ind += 4;
        }
        else {
            if (need == 2)
                out[out_ind++] = '\\';
            out[out_ind++] = (char)ch;
        }
    }
    // Stopping early means the budget was exceeded.
    if (inp[inp_ind] != '\0') {
        xfree(out);
        return NULL;
    }
    out[out_ind] = '\0';
    return out;
}
/*
 * Writes a printable-ASCII rendering of `inp` into `out` (capacity `outsize`
 * bytes): bytes outside 0x20..0x7e become "\xNN", and backslash and double
 * quote are preceded by a backslash. Output is silently truncated when it
 * does not fit and is always NUL-terminated when `outsize` is non-zero.
 *
 * Returns the number of bytes written, not counting the terminator.
 */
size_t sanitize2ascii(char *out, const char *inp, size_t outsize) {
    size_t out_ind = 0;
    char tmp[5];
    // Without room for even the terminator nothing may be written.
    if (outsize == 0)
        return 0;
    for (size_t inp_ind = 0; inp[inp_ind] != '\0'; inp_ind++) {
        // Work on the unsigned value: a negative char would be sign-extended
        // by "%02x" and the escape would come out truncated as "\xff".
        unsigned char ch = (unsigned char)inp[inp_ind];
        if (ch < 0x20 || ch > 0x7e) {
            if (out_ind + 5 >= outsize)
                break;
            snprintf(tmp, sizeof(tmp), "\\x%02x", (unsigned)ch);
            memcpy(out + out_ind, tmp, 4);
            out_ind += 4; // needs to accommodate "\xXX"
        }
        else if (ch == '\\' || ch == '"') {
            if (out_ind + 3 >= outsize)
                break;
            out[out_ind++] = '\\';
            out[out_ind++] = (char)ch;
        }
        else {
            if (out_ind + 2 >= outsize)
                break;
            out[out_ind++] = (char)ch;
        }
    }
    out[out_ind] = '\0';
    return out_ind;
}
// This is the sample c implementation of MurmurHash taken from Wikipedia. Credit to the Wikipedia article
// of MurmurHash and whoever made the sample implementation on the page.
// https://en.wikipedia.org/w/index.php?title=MurmurHash&oldid=1218923262 accessed on 2024-06-02T18+00:00
// ----- BEGIN WIKIPEDIA SAMPLE CODE -----
// Mixes one 32-bit block: multiply, rotate left by 15 bits, multiply again.
static inline uint32_t murmur_32_scramble(uint32_t k) {
k *= 0xcc9e2d51;
k = (k << 15) | (k >> 17);
k *= 0x1b873593;
return k;
}
// Hashes the `len` bytes at `key` to a 32-bit value, starting from `seed`.
// Whole 4-byte groups are read in host byte order, so results differ between
// endiannesses (see the comment inside the loop).
uint32_t murmur3_32(const uint8_t* key, size_t len, uint32_t seed)
{
uint32_t h = seed;
uint32_t k;
/* Read in groups of 4. */
for (size_t i = len >> 2; i; i--) {
// Here is a source of differing results across endiannesses.
// A swap here has no effects on hash properties though.
memcpy(&k, key, sizeof(uint32_t));
key += sizeof(uint32_t);
h ^= murmur_32_scramble(k);
h = (h << 13) | (h >> 19);
h = h * 5 + 0xe6546b64;
}
/* Read the rest. */
k = 0;
for (size_t i = len & 3; i; i--) {
k <<= 8;
k |= key[i - 1];
}
// A swap is *not* necessary here because the preceding loop already
// places the low bytes in the low places according to whatever endianness
// we use. Swaps only apply when the memory is copied in a chunk.
h ^= murmur_32_scramble(k);
/* Finalize. */
h ^= len;
h ^= h >> 16;
h *= 0x85ebca6b;
h ^= h >> 13;
h *= 0xc2b2ae35;
h ^= h >> 16;
return h;
}
// ----- END WIKIPEDIA SAMPLE CODE -----
/*
 * Equality callback for hash sets of C strings: both arguments point at the
 * stored `char *` elements. Returns true when the strings have equal contents.
 */
bool hset_charp_cmp(char **lhs, char **rhs) {
    const char *a = *lhs;
    const char *b = *rhs;
    return !strcmp(a, b);
}
/*
 * Hash callback for hash sets of C strings: `str` points at the stored
 * `char *` element. Hashes the string's bytes (without the terminator) with
 * murmur3_32 and a fixed seed.
 */
size_t hset_charp_hash(char **str) {
    // murmur3_32 takes raw bytes; the cast makes the char -> uint8_t
    // reinterpretation explicit instead of relying on an implicit
    // incompatible-pointer conversion.
    const uint8_t *bytes = (const uint8_t *)*str;
    // u32 -> u64?
    return (size_t)murmur3_32(bytes, strlen(*str), 0x9747b28c);
}

45
tests/deque_pop.c Normal file
View File

@ -0,0 +1,45 @@
#include <stdlib.h>
#include "util.h"
#include "unit.h"
#define TEST_LEN 32768
#define SIZE_MAX (size_t)-1
// Test: fills a deque from both ends with random values, clones it twice, and
// checks that popping from the front and from the back yields the values in
// the expected order. Returns 0; chi_assert() exits on the first mismatch.
int tests_deque_pop(int argc, char **argv) {
int *expected_front = xmalloc(TEST_LEN / 2 * sizeof(int));
int *expected_back = xmalloc(TEST_LEN / 2 * sizeof(int));
for (size_t i = 0; i < TEST_LEN / 2; i++)
expected_front[i] = rand();
for (size_t i = 0; i < TEST_LEN / 2; i++)
expected_back[i] = rand();
int_deque_t deq;
deque_init(int_deque_t, deq);
// Front values are pushed in reverse so that, read front to back, the deque
// holds expected_front[0..] followed by expected_back[0..]. The random
// branch only varies the order of the two pushes within an iteration.
for (size_t i = 0; i < TEST_LEN / 2; i++) {
if (rand() % 2 == 0) {
deque_push_back(deq, expected_back[i]);
deque_push_front(deq, expected_front[TEST_LEN / 2 - i - 1]);
}
else {
deque_push_front(deq, expected_front[TEST_LEN / 2 - i - 1]);
deque_push_back(deq, expected_back[i]);
}
}
// Work on two clones so each pop direction starts from the full contents.
int_deque_t from_back, from_front;
deque_clone(from_back, deq);
deque_clone(from_front, deq);
deque_destroy(deq);
// Front pops: logical index i runs forwards.
for (size_t i = 0; i < 2 * (TEST_LEN / 2); i++) {
int exp = (i >= TEST_LEN / 2) ? expected_back[i - TEST_LEN / 2] : expected_front[i];
chi_assert("from_front value doesnt match", deque_pop_front(from_front) == exp);
}
// Back pops: logical index i runs backwards; the loop ends when i wraps
// around to SIZE_MAX after reaching 0.
for (size_t i = 2 * (TEST_LEN / 2) - 1; i != SIZE_MAX; i--) {
int exp = (i >= TEST_LEN / 2) ? expected_back[i - TEST_LEN / 2] : expected_front[i];
chi_assert("from_back value doesnt match", deque_pop_back(from_back) == exp);
}
deque_destroy(from_front);
deque_destroy(from_back);
xfree(expected_back);
xfree(expected_front);
return 0;
}

58
tests/deque_push.c Normal file
View File

@ -0,0 +1,58 @@
#include "util.h"
#include "unit.h"
#define SIZE_MAX (size_t)-1
// Test: checks the deque's internal layout (base/front/back/len/cap) after
// pushes on both ends, including growth past DEQUE_INIT_CAP, then checks
// front-only and back-only fills through deque_get().
// Returns 0; chi_assert() exits on the first failed check.
int tests_deque_push(int argc, char **argv) {
int_deque_t deq;
// push back and front
deque_init(int_deque_t, deq);
deque_push_back(deq, 10);
chi_assert("incorrect back value", deq.base[0] == 10);
deque_push_front(deq, 69);
chi_assert("incorrect back value", deq.base[0] == 10);
chi_assert("incorrect front value", deq.base[deq.cap-1] == 69);
// Fill the remaining initial capacity from the front.
for(size_t i = 0; i < DEQUE_INIT_CAP - 2; i++)
deque_push_front(deq, i);
chi_assert("length and/or capacity incorrect", deq.len == deq.cap && deq.cap == DEQUE_INIT_CAP);
chi_assert("incorrect back value", deq.base[0] == 10);
chi_assert("incorrect front value ", deq.base[deq.cap-1] == 69);
chi_assert("incorrect back", deq.back == 1);
chi_assert("incorrect front", deq.front == 1);
for (size_t i = 2; i < deq.cap - 1; i++)
chi_assert("incorrect ordering", deq.base[i-1] > deq.base[i]);
// One more push exceeds the capacity; the checks below expect the capacity
// to double and the contents to be laid out from base[0].
deque_push_back(deq, 0xee);
chi_assert("length and/or capacity incorrect correct",
deq.len == DEQUE_INIT_CAP + 1 && deq.cap == DEQUE_INIT_CAP * 2);
chi_assert("incorrect back value 1", deq.base[deq.len - 1] == 0xee);
chi_assert("incorrect back value 2", deq.base[deq.len - 2] == 10);
chi_assert("incorrect front value", deq.base[deq.len - 3] == 69);
chi_assert("incorrect back", deq.back == DEQUE_INIT_CAP + 1);
chi_assert("incorrect front", deq.front == 0);
for (size_t i = 1; i < deq.len - 3; i++)
chi_assert("incorrect ordering", deq.base[i-1] > deq.base[i]);
deque_destroy(deq);
// push front
deque_init(int_deque_t, deq);
// Counts down to 0; the loop ends when i wraps around to SIZE_MAX.
for (size_t i = 4 * DEQUE_INIT_CAP - 1; i != SIZE_MAX; i--)
deque_push_front(deq, i);
chi_assert("incorrect back and/or front", deq.back == deq.front && deq.back == DEQUE_INIT_CAP * 2);
chi_assert("incorrect length and/or capcity", deq.len == deq.cap && deq.cap == 4 * DEQUE_INIT_CAP);
for (size_t i = 0; i < deq.cap; i++)
chi_assert("incorrect value", *deque_get(deq, i) == i);
deque_destroy(deq);
// push back
deque_init(int_deque_t, deq);
for (size_t i = 0; i < 4 * DEQUE_INIT_CAP; i++)
deque_push_back(deq, i);
chi_assert("incorrect back and/or front", deq.back == deq.front && deq.back == 0);
chi_assert("incorrect length and/or capcity", deq.len == deq.cap && deq.cap == 4 * DEQUE_INIT_CAP);
for (size_t i = 0; i < deq.cap; i++)
chi_assert("incorrect value", *deque_get(deq, i) == i);
deque_destroy(deq);
return 0;
}

133
tests/dynarr.c.old Normal file
View File

@ -0,0 +1,133 @@
#include <string.h>
#include "util.h"
#include "unit.h"
#define TEST_LEN 32768
// Checks that DYNARR_GET(arr, 5) yields the address of element 5.
// NOTE(review): `arr` is never released with DYNARR_DEINIT here.
void get_test(void) {
int_dynarr_t arr = DYNARR_INIT(int_dynarr_t);
for (int i = 0; i < 100; i++)
DYNARR_PUSH(arr, 0);
chi_assert("get(5) == arr.data + 5", DYNARR_GET(arr, 5) == arr.data + 5);
}
// Builds the same TEST_LEN random ints three ways (bulk extend, element-wise
// push, and half of each) and checks contents and length every time.
void extensions_test(void) {
int *cmp = xmalloc(TEST_LEN * sizeof(int));
for (size_t i = 0; i < TEST_LEN; i++)
cmp[i] = rand();
// 1) bulk extend only
int_dynarr_t extend = DYNARR_INIT(int_dynarr_t);
DYNARR_EXTEND_FIXED(extend, cmp, TEST_LEN);
chi_assert("extend.data != cmp", memcmp(extend.data, cmp, TEST_LEN * sizeof(int)) == 0);
chi_assert("extend.len != TEST_LEN", extend.len == TEST_LEN);
DYNARR_DEINIT(extend);
// 2) push only
int_dynarr_t push = DYNARR_INIT(int_dynarr_t);
for (size_t i = 0; i < TEST_LEN; i++)
DYNARR_PUSH(push, cmp[i]);
chi_assert("push.data != cmp", memcmp(push.data, cmp, TEST_LEN * sizeof(int)) == 0);
chi_assert("push.len != TEST_LEN", push.len == TEST_LEN);
DYNARR_DEINIT(push);
// 3) first half by extend, second half by push
int_dynarr_t both = DYNARR_INIT(int_dynarr_t);
DYNARR_EXTEND_FIXED(both, cmp, TEST_LEN / 2);
for (size_t i = TEST_LEN / 2; i < TEST_LEN; i++)
DYNARR_PUSH(both, cmp[i]);
chi_assert("both.data != cmp", memcmp(both.data, cmp, TEST_LEN * sizeof(int)) == 0);
chi_assert("both.len != TEST_LEN", both.len == TEST_LEN);
DYNARR_DEINIT(both);
xfree(cmp);
}
// Two insertion checks: interleaving odd values into an array of even values
// must give 0..TEST_LEN-1 in order, and inserting at random positions must
// preserve the number of elements and their sum.
void insert_test(void) {
size_t_dynarr_t increm = DYNARR_INIT(size_t_dynarr_t);
// Even values first, then each odd value i inserted at index i.
for (size_t i = 0; i < TEST_LEN; i += 2)
DYNARR_PUSH(increm, i);
for (size_t i = 1; i < TEST_LEN; i += 2)
DYNARR_INSERT(increm, i, i);
chi_assert("arr.len == TEST_LEN", increm.len == TEST_LEN);
for (size_t i = 0; i < TEST_LEN; i++)
chi_assert("arr[i] == i", *DYNARR_GET(increm, i) == i);
DYNARR_DEINIT(increm);
int_dynarr_t randins = DYNARR_INIT(int_dynarr_t);
// parity: sum of inserted values; check: sum read back from the array.
long long parity = 0, check = 0;
for (size_t i = 0; i < TEST_LEN; i++) {
int gen = rand() % 10;
parity += gen;
DYNARR_INSERT(randins, (rand() % (randins.len + 1)), gen);
}
chi_assert("arr.len == TEST_LEN", randins.len == TEST_LEN);
for (size_t i = 0; i < TEST_LEN; i++)
check += *DYNARR_GET(randins, i);
chi_assert("parity == check", parity == check);
DYNARR_DEINIT(randins);
}
// Copies `check` into a dynarr, then removes random indices from both the
// dynarr and the plain array (via memmove), comparing them after each step.
// `check` is modified in place.
// NOTE(review): the loop bound dyn.len shrinks while i grows, so only about
// half of the elements are removed, and `dyn` is never released.
void check_arr(int check[], size_t check_len) {
int_dynarr_t dyn = DYNARR_INIT(int_dynarr_t);
DYNARR_EXTEND_FIXED(dyn, check, check_len);
chi_assert("dyn.len == check_len", dyn.len == check_len);
chi_assert("dyn.data == check", memcmp(dyn.data, check, check_len * sizeof(int)) == 0);
for (size_t i = 0; i < dyn.len; i++) {
size_t ind = rand() % check_len;
DYNARR_REMOVE(dyn, ind);
memmove(check + ind, check + ind + 1, (check_len - ind - 1) * sizeof(int));
check_len -= 1;
chi_assert("dyn.len == check_len (modified)", dyn.len == check_len);
chi_assert("dyn.data == check (modified)", memcmp(dyn.data, check, check_len * sizeof(int)) == 0);
}
}
// Removes every element of a random array at random positions, checking the
// length after each removal and that the removed values sum to the original
// total; then runs check_arr() on a fresh random array.
void remove_test(void) {
int_dynarr_t randdel = DYNARR_INIT(int_dynarr_t);
long long parity = 0;
for (size_t i = 0; i < TEST_LEN; i++) {
int gen = rand() % 10;
parity += gen;
DYNARR_PUSH(randdel, gen);
}
// check starts at the full sum and must reach 0 once everything is removed.
long long check = parity;
for (size_t i = 0; i < TEST_LEN; i++) {
size_t ind = rand() % randdel.len;
check -= *DYNARR_GET(randdel, ind);
DYNARR_REMOVE(randdel, ind);
chi_assert("randdel.len == TEST_LEN - i - 1", randdel.len == TEST_LEN - i - 1);
}
chi_assert("check == 0", check == 0);
int *c = xmalloc(TEST_LEN * sizeof(int));
for (int i = 0; i < TEST_LEN; i++)
c[i] = rand();
check_arr(c, TEST_LEN);
xfree(c);
DYNARR_DEINIT(randdel);
}
// Runs the test named by the single command-line argument. Returns 1 when
// the argument count is wrong; unknown names run nothing.
// NOTE(review): push_test() is not defined anywhere in this file -- confirm
// whether it was removed or lives elsewhere before reviving this file.
int main(int argc, char **argv) {
if (argc != 2)
return 1;
if (strcmp(argv[1], "push") == 0) {
push_test();
} else if (strcmp(argv[1], "get") == 0) {
get_test();
} else if (strcmp(argv[1], "extensions") == 0) {
extensions_test();
} else if (strcmp(argv[1], "insert") == 0) {
insert_test();
} else if (strcmp(argv[1], "remove") == 0) {
remove_test();
}
}

34
tests/dynarr_extensions.c Normal file
View File

@ -0,0 +1,34 @@
#include "util.h"
#include "unit.h"
#define TEST_LEN 32768
// Test: builds the same TEST_LEN random ints three ways (bulk extend,
// element-wise push, and half of each) and checks contents and length every
// time. Returns 0; chi_assert() exits on the first failed check.
int tests_dynarr_extensions(int argc, char **argv) {
int *cmp = xmalloc(TEST_LEN * sizeof(int));
for (size_t i = 0; i < TEST_LEN; i++)
cmp[i] = rand();
// 1) bulk extend only
int_dynarr_t extend = dynarr_initi(int_dynarr_t);
dynarr_extend_fixed(extend, cmp, TEST_LEN);
chi_assert("extend.data != cmp", memcmp(extend.data, cmp, TEST_LEN * sizeof(int)) == 0);
chi_assert("extend.len != TEST_LEN", extend.len == TEST_LEN);
dynarr_destroy(extend);
// 2) push only
int_dynarr_t push = dynarr_initi(int_dynarr_t);
for (size_t i = 0; i < TEST_LEN; i++)
dynarr_push(push, cmp[i]);
chi_assert("push.data != cmp", memcmp(push.data, cmp, TEST_LEN * sizeof(int)) == 0);
chi_assert("push.len != TEST_LEN", push.len == TEST_LEN);
dynarr_destroy(push);
// 3) first half by extend, second half by push
int_dynarr_t both = dynarr_initi(int_dynarr_t);
dynarr_extend_fixed(both, cmp, TEST_LEN / 2);
for (size_t i = TEST_LEN / 2; i < TEST_LEN; i++)
dynarr_push(both, cmp[i]);
chi_assert("both.data != cmp", memcmp(both.data, cmp, TEST_LEN * sizeof(int)) == 0);
chi_assert("both.len != TEST_LEN", both.len == TEST_LEN);
dynarr_destroy(both);
xfree(cmp);
return 0;
}

13
tests/dynarr_get.c Normal file
View File

@ -0,0 +1,13 @@
#include "util.h"
#include "unit.h"
#define TEST_LEN 32768
// Test: after pushing 100 elements, dynarr_get(arr, 5) must yield the address
// of element 5. Returns 0; chi_assert() exits on failure.
int tests_dynarr_get(int argc, char **argv) {
int_dynarr_t arr = dynarr_initi(int_dynarr_t);
for (int i = 0; i < 100; i++)
dynarr_push(arr, 0);
chi_assert("get(5) == arr.data + 5", dynarr_get(arr, 5) == arr.data + 5);
dynarr_destroy(arr);
return 0;
}

View File

@ -0,0 +1,7 @@
#include "util.h"
// Death test: calls dynarr_get() with index 0 on an empty array.
// Presumably dynarr_get() is expected to abort before `return 0` is reached
// -- confirm against how the "_death" tests are registered in CMake.
int tests_dynarr_get1_death(int argc, char **argv) {
int_dynarr_t a = dynarr_initi(int_dynarr_t);
dynarr_get(a, 0);
return 0;
}

View File

@ -0,0 +1,9 @@
#include "util.h"
// Death test: calls dynarr_get() with index == len (one past the last
// element) on a 1000-element array.
// Presumably dynarr_get() is expected to abort before `return 0` is reached
// -- confirm against how the "_death" tests are registered in CMake.
int tests_dynarr_get2_death(int argc, char **argv) {
int_dynarr_t a = dynarr_initi(int_dynarr_t);
for (int i = 0; i < 1000; i++)
dynarr_push(a, i);
dynarr_get(a, a.len);
return 0;
}

10
tests/dynarr_get3_death.c Normal file
View File

@ -0,0 +1,10 @@
#include "util.h"
// Death test: calls dynarr_get() with a negative int index (-1) on a
// 1000-element array.
// Presumably dynarr_get() is expected to abort before `return 0` is reached
// -- confirm against how the "_death" tests are registered in CMake.
int tests_dynarr_get3_death(int argc, char **argv) {
int_dynarr_t a = dynarr_initi(int_dynarr_t);
for (int i = 0; i < 1000; i++)
dynarr_push(a, i);
int k = -1;
dynarr_get(a, k);
return 0;
}

35
tests/dynarr_insert.c Normal file
View File

@ -0,0 +1,35 @@
#include "util.h"
#include "unit.h"
#define TEST_LEN 32768
// Test: interleaving odd values into an array of even values must give
// 0..TEST_LEN-1 in order, and inserting at random positions must preserve the
// number of elements and their sum. Returns 0; chi_assert() exits on failure.
int tests_dynarr_insert(int argc, char **argv) {
size_dynarr_t increm = dynarr_initi(size_dynarr_t);
// Even values first, then each odd value i inserted at index i.
for (size_t i = 0; i < TEST_LEN; i += 2)
dynarr_push(increm, i);
for (size_t i = 1; i < TEST_LEN; i += 2)
dynarr_insert(increm, i, i);
chi_assert("arr.len == TEST_LEN", increm.len == TEST_LEN);
for (size_t i = 0; i < TEST_LEN; i++)
chi_assert("arr[i] == i", *dynarr_get(increm, i) == i);
dynarr_destroy(increm);
int_dynarr_t randins = dynarr_initi(int_dynarr_t);
// parity: sum of inserted values; check: sum read back from the array.
long long parity = 0, check = 0;
for (size_t i = 0; i < TEST_LEN; i++) {
int gen = rand() % 10;
parity += gen;
dynarr_insert(randins, (rand() % (randins.len + 1)), gen);
}
chi_assert("arr.len == TEST_LEN", randins.len == TEST_LEN);
for (size_t i = 0; i < TEST_LEN; i++)
check += *dynarr_get(randins, i);
chi_assert("parity == check", parity == check);
dynarr_destroy(randins);
return 0;
}

50
tests/dynarr_remove.c Normal file
View File

@ -0,0 +1,50 @@
#include "util.h"
#include "unit.h"
#define TEST_LEN 32768
/*
 * Copies the `check_len` ints at `check` into a dynarr, then removes elements
 * at random indices from both the dynarr and the plain array (via memmove)
 * until both are empty, comparing length and contents after every removal.
 * `check` is modified in place.
 */
void check_arr(int check[], size_t check_len) {
    int_dynarr_t dyn = dynarr_initi(int_dynarr_t);
    dynarr_extend_fixed(dyn, check, check_len);
    chi_assert("dyn.len == check_len", dyn.len == check_len);
    chi_assert("dyn.data == check", memcmp(dyn.data, check, check_len * sizeof(int)) == 0);
    // Loop on the remaining length: an index counting up against the
    // shrinking dyn.len would stop after only half of the removals.
    while (dyn.len > 0) {
        size_t ind = rand() % check_len;
        dynarr_remove(dyn, ind);
        memmove(check + ind, check + ind + 1, (check_len - ind - 1) * sizeof(int));
        check_len -= 1;
        chi_assert("dyn.len == check_len (modified)", dyn.len == check_len);
        chi_assert("dyn.data == check (modified)", memcmp(dyn.data, check, check_len * sizeof(int)) == 0);
    }
    dynarr_destroy(dyn);
}
// Test: removes every element of a random array at random positions,
// checking the length after each removal and that the removed values sum to
// the original total; then runs check_arr() on a fresh random array.
// Returns 0; chi_assert() exits on the first failed check.
int tests_dynarr_remove(int argc, char **argv) {
int_dynarr_t randdel = dynarr_initi(int_dynarr_t);
long long parity = 0;
for (size_t i = 0; i < TEST_LEN; i++) {
int gen = rand() % 10;
parity += gen;
dynarr_push(randdel, gen);
}
// check starts at the full sum and must reach 0 once everything is removed.
long long check = parity;
for (size_t i = 0; i < TEST_LEN; i++) {
size_t ind = rand() % randdel.len;
check -= *dynarr_get(randdel, ind);
dynarr_remove(randdel, ind);
chi_assert("randdel.len == TEST_LEN - i - 1", randdel.len == TEST_LEN - i - 1);
}
chi_assert("check == 0", check == 0);
int *c = xmalloc(TEST_LEN * sizeof(int));
for (int i = 0; i < TEST_LEN; i++)
c[i] = rand();
check_arr(c, TEST_LEN);
xfree(c);
dynarr_destroy(randdel);
return 0;
}

69
tests/hset_add.c Normal file
View File

@ -0,0 +1,69 @@
#include <stdbool.h>
#include <string.h>
#include "util.h"
#include "unit.h"
/*
 * Hash callback for the string sets in this test: applies parityhash() to
 * the bytes of the string that `ptr` points at (terminator excluded).
 */
size_t charp_parityhash(char **ptr) {
    const char *text = *ptr;
    size_t nbytes = strlen(text);
    return parityhash(text, nbytes);
}
/*
 * Equality callback for the string sets in this test: true when the two
 * pointed-to strings have identical contents.
 */
bool charp_cmp(char **lhs, char **rhs) {
    const char *a = *lhs;
    const char *b = *rhs;
    return !strcmp(a, b);
}
// Test for hset_add()/hset_find() on string sets. The checks expect
// hset_add() to evaluate false for a new element and true for one that is
// already present. The hash in use is the byte sum, so anagrams such as
// "hello"/"ehllo" land on the same hash value and exercise collisions.
// Returns 0; chi_assert() exits on the first failed check.
int tests_hset_add(int argc, char **argv) {
charp_hset_t hset1;
hset_init(charp_hset_t, hset1, charp_parityhash, charp_cmp);
char *ptr = "hello";
chi_assert("\"hello\" should not be apart of the set", !hset_add(hset1, ptr));
chi_assert("\"hello\" should be apart of the set", hset_add(hset1, ptr));
// Same hash as "hello" but different contents.
ptr = "ehllo";
chi_assert("\"ehllo\" should not be apart of the set", !hset_add(hset1, ptr));
// A separate heap copy with equal contents must count as already present.
char *heap = xmalloc(strlen(ptr) + 1);
strcpy(heap, ptr);
chi_assert("\"ehllo\" should be apart of the set", hset_add(hset1, heap));
xfree(heap);
hset_destroy(hset1);
charp_hset_t hset2;
hset_init(charp_hset_t, hset2, charp_parityhash, charp_cmp);
#define MAX_STR_SIZE 3
// Room for MAX_STR_SIZE + 1 digits plus the terminator: the counter below
// briefly becomes one digit longer right before its loop ends.
ptr = xmalloc(MAX_STR_SIZE + 2);
strcpy(ptr, "0");
char buf[100];
// Pass 1: add every decimal string from "0" up to MAX_STR_SIZE digits.
for(int len = 1; len < MAX_STR_SIZE + 1;) {
snprintf(buf, sizeof(buf)-1, "\"%s\" shouldn't be apart of the set", ptr);
// NOTE(review): each tmp copy is handed to the set and never freed here --
// confirm whether hset_destroy() takes care of stored elements.
char *tmp = xmalloc(strlen(ptr) + 1);
strcpy(tmp, ptr);
chi_assert(buf, !hset_add(hset2, tmp));
// Increment the decimal string in place: c is the carry, s the digit sum.
int c = 1;
for (int i = len - 1, s = 0; i >= 0; i--)
s = (ptr[i] - '0') + c, c = s > 9, ptr[i] = s % 10 + '0';
// Carry out of the top digit: shift right and prepend '1' (length grows).
if (c != 0) {
memmove(ptr + 1, ptr, (len++) + 1);
*ptr = '1';
}
}
// Pass 2: every one of those strings must now be found and reported as
// already present.
strcpy(ptr, "0");
for(int len = 1; len < MAX_STR_SIZE + 1;) {
snprintf(buf, sizeof(buf)-1, "\"%s\" should be apart of the set", ptr);
chi_assert(buf, hset_find(hset2, ptr));
chi_assert(buf, hset_add(hset2, ptr));
int c = 1;
for (int i = len - 1, s = 0; i >= 0; i--)
s = (ptr[i] - '0') + c, c = s > 9, ptr[i] = s % 10 + '0';
if (c != 0) {
memmove(ptr + 1, ptr, (len++) + 1);
*ptr = '1';
}
}
xfree(ptr);
hset_destroy(hset2);
return 0;
}

34
tests/hset_iter.c Normal file
View File

@ -0,0 +1,34 @@
#include <stdbool.h>
#include <string.h>
#include "util.h"
#include "unit.h"
/*
 * Identity hash for int sets: the pointed-to value converted to size_t.
 */
size_t int_hash(int *ptr) {
    int value = *ptr;
    return value;
}
/*
 * Test for hset_iter(): fills hset1 with 0..999, copies every element into
 * hset2 by iterating over hset1, then checks that hset2 contains all 1000
 * values. Returns 0; chi_assert() exits on the first missing value.
 */
int tests_hset_iter(int argc, char **argv) {
    int_hset_t hset1, hset2;
    hset_init(int_hset_t, hset1, int_hash, NULL);
    hset_init(int_hset_t, hset2, int_hash, NULL);
    for(int i = 0; i < 1000; i++)
        hset_add(hset1, i);
    void *saveptr = NULL;
    int *data;
    // Iterate the populated set (hset1). Iterating hset2 here would walk an
    // empty set and copy nothing.
    while ((data = hset_iter(hset1, saveptr)) != NULL)
        hset_add(hset2, *data);
    char msgbuf[100];
    for(int i = 0; i < 1000; i++) {
        snprintf(msgbuf, sizeof(msgbuf), "%d not in hset2", i);
        chi_assert(msgbuf, hset_find(hset2, i));
    }
    hset_destroy(hset2);
    hset_destroy(hset1);
    return 0;
}

78
tests/json_write.c Normal file
View File

@ -0,0 +1,78 @@
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include "json.h"
#include "unit.h"
// Exact serialisation that tests_json_write() compares json_write()'s output
// against (no whitespace; the duplicated "null" key is intentional, matching
// the two pushes of that pair in the test).
#define CMP_STR "{\"string\":\"hello\",\"number\":69,\"boolean\":true,\"null\":" \
"null,\"null\":null,\"array\":[\"hello\",\"world\",\"it's\",\"max\"," \
"\"flow\",\"with\",\"ryhmes\",\"so-so\",0,-100,false,null],\"object\":{\"hello\":" \
"\"dipshit\"}}"
/*
 * Test for json_write(): builds an object holding every value kind (string,
 * number, boolean, null, array, nested object), serialises it into a pipe,
 * reads the bytes back and compares them with CMP_STR.
 * Returns 0 on success and 1 when a system call fails; chi_assert() exits on
 * a mismatch.
 */
int tests_json_write(int argc, char **argv) {
    jsonkv_dynarr_t map = dynarr_initi(jsonkv_dynarr_t);
    jsonkv_t kv;
    kv = (jsonkv_t){ .key = "string", .val = json_createstr("hello") };
    dynarr_push(map, kv);
    kv = (jsonkv_t){ .key = "number", .val = json_createint(69) };
    dynarr_push(map, kv);
    kv = (jsonkv_t){ .key = "boolean", .val = json_createbool(true) };
    dynarr_push(map, kv);
    // The null pair is pushed twice on purpose; CMP_STR expects both.
    kv = (jsonkv_t){ .key = "null", .val = json_createnull() };
    dynarr_push(map, kv);
    dynarr_push(map, kv);
    jsonval_dynarr_t subarr = dynarr_initi(jsonval_dynarr_t);
    dynarr_push(subarr, json_createstr("hello"));
    dynarr_push(subarr, json_createstr("world"));
    dynarr_push(subarr, json_createstr("it's"));
    dynarr_push(subarr, json_createstr("max"));
    dynarr_push(subarr, json_createstr("flow"));
    dynarr_push(subarr, json_createstr("with"));
    dynarr_push(subarr, json_createstr("ryhmes"));
    dynarr_push(subarr, json_createstr("so-so"));
    dynarr_push(subarr, json_createint(0));
    dynarr_push(subarr, json_createint(-100));
    dynarr_push(subarr, json_createbool(false));
    dynarr_push(subarr, json_createnull());
    jsonval_t j_subarr = json_createarr(subarr);
    kv = (jsonkv_t){ .key = "array", .val = j_subarr };
    dynarr_push(map, kv);
    jsonkv_dynarr_t submap = dynarr_initi(jsonkv_dynarr_t);
    kv = (jsonkv_t){ .key = "hello", .val = json_createstr("dipshit") };
    dynarr_push(submap, kv);
    kv = (jsonkv_t){ .key = "object", .val = json_createobj(submap) };
    dynarr_push(map, kv);
    jsonval_t j_map = json_createobj(map);
    // fds[0] is the read end, fds[1] the write end.
    int fds[2];
    if (pipe(fds) < 0) {
        fprintf(stderr, "pipe() failed: %s\n", strerror(errno));
        return 1;
    }
    FILE *writer = fdopen(fds[1], "w");
    if (writer == NULL) {
        fprintf(stderr, "fdopen() failed: %s\n", strerror(errno));
        close(fds[0]);
        close(fds[1]);
        return 1;
    }
    json_write(writer, &j_map);
    json_destroy(&j_map);
    char buf[1000];
    // Push the buffered output into the pipe before reading it back.
    fflush(writer);
    // Leave one byte free for the terminator written below.
    ssize_t cnt = read(fds[0], buf, sizeof(buf) - 1);
    if (cnt < 0) {
        fprintf(stderr, "read() failed: %s\n", strerror(errno));
        fclose(writer);
        close(fds[0]);
        return 1;
    }
    buf[cnt] = '\0';
    printf("cmp: %s\n", CMP_STR);
    printf("buf: %s\n", buf);
    chi_assert("test strings do not match", strcmp(buf, CMP_STR) == 0);
    fclose(writer); // also closes fds[1]
    close(fds[0]);
    return 0;
}

0
tests/robots_txt.c Normal file
View File

3
tests/unit.h Normal file
View File

@ -0,0 +1,3 @@
#include <stdio.h>
#include <stdlib.h>
// Minimal test assertion: when `test` evaluates false, prints `message` with
// the current line number to stderr and terminates the process with exit
// status 1. Wrapped in do { } while(0) so it behaves as a single statement.
#define chi_assert(message, test) do { if (!(test)) { fprintf(stderr, "ASSERT FAILED (line %d): %s\n", __LINE__, (message)); exit(1); } } while(0)