153 lines
6.2 KiB
C
153 lines
6.2 KiB
C
#include <ctype.h>
|
|
#include <string.h>
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <stdbool.h>
|
|
#include <curl/curl.h>
|
|
|
|
#include "http.h"
|
|
#include "util.h"
|
|
|
|
/* Page buffer sizing: initial allocation, hard upper bound, and the factor
 * by which the buffer is multiplied each time it has to grow. */
#define INIT_PAGE_SIZE (8 * 1024)
#define MAX_PAGE_SIZE (1024 * 1024)
#define MEM_INC_FACTOR 2

/* Values handed to libcurl's CURLOPT_TIMEOUT_MS / CURLOPT_CONNECTTIMEOUT_MS. */
#define TIMEOUT_MS (10 * 1000)
#define CONNECT_TIMEOUT_MS (3 * 1000)
|
|
|
|
/* NULL-terminated list of agent names (presumably User-Agent values; the
 * array is not referenced by the other definitions visible in this file). */
const char *useragents[] = { "AvaBot", NULL };
|
|
|
|
/**
 * libcurl write callback: hands the received chunk to every module that has an
 * `onpagewrite` hook, then appends it to the page buffer in `userdata`.
 *
 * The buffer [base, end) is grown by MEM_INC_FACTOR until the chunk fits or the
 * buffer reaches MAX_PAGE_SIZE bytes. One byte is always kept free for a
 * trailing '\0', so the stored data stays NUL-terminated at `begin`.
 *
 * @param ptr      chunk delivered by libcurl (not NUL-terminated)
 * @param size     unused; the byte count is taken from `nmemb`
 * @param nmemb    number of bytes available at `ptr`
 * @param userdata buffer pointers, URL and module list for this transfer
 * @return number of bytes actually stored. This is less than `nmemb` when the
 *         buffer is capped at MAX_PAGE_SIZE and the chunk had to be truncated
 *         (libcurl treats a short count as a write error).
 */
size_t bufwritecb(const byte *ptr, size_t size, size_t nmemb, writecb_data_t *userdata) {
    (void)size;

    for (size_t i = 0; i < userdata->modules->len; i++) {
        if (userdata->modules->data[i]->module.onpagewrite == NULL)
            continue;

        int rc = userdata->modules->data[i]->module.onpagewrite(
            userdata->modules->data[i]->module.userdata, userdata->url, ptr, nmemb);
        if (rc != 0)
            error("module %s onpagewrite failed with code %d",
                  userdata->modules->data[i]->name, rc);
    }

    // Grow while there is no room for the chunk plus its terminator.
    while ((size_t)(userdata->end - userdata->begin) <= nmemb) {
        size_t buf_len = (size_t)(userdata->end - userdata->base);
        if (buf_len >= MAX_PAGE_SIZE)
            break;

        size_t new_buf_len = buf_len * MEM_INC_FACTOR;
        if (new_buf_len > MAX_PAGE_SIZE)
            new_buf_len = MAX_PAGE_SIZE;
        if (new_buf_len <= buf_len)
            break; // cannot grow (e.g. zero-length buffer); avoid spinning forever

        // Record the write offset before the old block may be released.
        size_t used = (size_t)(userdata->begin - userdata->base);
        byte *new_base = xrealloc(userdata->base, new_buf_len);
        userdata->base = new_base;
        userdata->begin = new_base + used;
        userdata->end = new_base + new_buf_len;
    }

    size_t avail = (size_t)(userdata->end - userdata->begin);
    if (avail == 0)
        return 0; // not even room for the terminator

    // Buffer may still be undersized: keep the last byte for the '\0'.
    size_t len = nmemb;
    if (len > avail - 1)
        len = avail - 1;

    memcpy(userdata->begin, ptr, len);
    userdata->begin[len] = '\0';
    userdata->begin += len;
    return len;
}
|
|
|
|
/**
 * Reports whether an HTTP status code is one of the redirect codes this file
 * recognises: 301, 302, 307 or 308. Any other value (including 303) yields
 * false.
 */
bool is_redirect(int status) {
    switch (status) {
    case 301: /* Moved Permanently */
    case 302: /* Found */
    case 307: /* Temporary Redirect */
    case 308: /* Permanent Redirect */
        return true;
    default:
        return false;
    }
}
|
|
|
|
/**
 * libcurl header callback: inspects one response header line.
 *
 * - Status lines ("HTTP/..."): counts the request, stores the 3-digit status
 *   code in `userdata->status`, and accepts only 200 or a code for which
 *   is_redirect() is true.
 * - Content-Type (first occurrence after a non-redirect status): sets
 *   HEADERCB_CONTENT_TYPE_ENCOUNTERED, and HEADERCB_VALID_MIME when the value
 *   starts with "text/html".
 * - Every other header is ignored.
 *
 * `buffer` is not assumed to be NUL-terminated; all reads stay within
 * `nitems` bytes.
 *
 * @param buffer   header line as delivered by libcurl
 * @param _size    unused
 * @param nitems   number of bytes in `buffer`
 * @param userdata per-transfer header state (status, flags, request count)
 * @return `nitems` to continue, or CURL_WRITEFUNC_ERROR on a malformed status
 *         line, a status other than 200/redirect, or a Content-Type whose
 *         value does not start with "text/html".
 */
size_t headerwritecb(const char *buffer, size_t _size, size_t nitems, headercb_data_t *userdata) {
    static const char content_type_str[] = "content-type:";
    static const char html_mime_str[] = "text/html";
    static const char http_str[] = "HTTP/";
    const size_t content_type_len = sizeof(content_type_str) - 1;
    const size_t html_mime_len = sizeof(html_mime_str) - 1;
    const size_t http_len = sizeof(http_str) - 1;

    (void)_size;

    if (nitems < http_len)
        return nitems;

    // Parses HTTP status line
    if (memcmp(buffer, http_str, http_len) == 0) {
        userdata->num_requests++;

        // The status code follows the first run of spaces.
        const char *space = memchr(buffer, ' ', nitems);
        size_t pos = (space != NULL) ? (size_t)(space - buffer) : nitems;
        while (pos < nitems && buffer[pos] == ' ')
            pos++;

        // Need three bytes for the status code inside the buffer.
        if (nitems - pos < 3) {
            userdata->status = 0;
            return CURL_WRITEFUNC_ERROR;
        }

        char code_str[4] = { buffer[pos], buffer[pos + 1], buffer[pos + 2], '\0' };
        userdata->status = atoi(code_str);

        if (userdata->status == 200 || is_redirect(userdata->status))
            return nitems;
        return CURL_WRITEFUNC_ERROR;
    }

    // Headers of redirect responses (or before any status line) are not inspected.
    if (userdata->status == 0 || is_redirect(userdata->status))
        return nitems;

    // Parses Content-Type header (only the first one is considered)
    if (userdata->flags & HEADERCB_CONTENT_TYPE_ENCOUNTERED)
        return nitems;

    if (nitems < content_type_len)
        return nitems;

    for (size_t i = 0; i < content_type_len; i++) {
        // Cast: passing a negative char to tolower() is undefined behaviour.
        if (tolower((unsigned char)buffer[i]) != content_type_str[i])
            return nitems;
    }

    // Skip optional whitespace between ':' and the value.
    size_t pos = content_type_len;
    while (pos < nitems && (buffer[pos] == ' ' || buffer[pos] == '\t'))
        pos++;

    userdata->flags |= HEADERCB_CONTENT_TYPE_ENCOUNTERED;

    // A value too short to hold "text/html" cannot match; never read past nitems.
    if (nitems - pos < html_mime_len ||
        memcmp(buffer + pos, html_mime_str, html_mime_len) != 0)
        return CURL_WRITEFUNC_ERROR;

    userdata->flags |= HEADERCB_VALID_MIME;
    return nitems;
}
|
|
|
|
/**
 * Initialises the callback state for one transfer.
 *
 * Zeroes `*data`, allocates an INIT_PAGE_SIZE-byte page buffer with xmalloc and
 * points base/begin/end at it, then records the URL and module list for the
 * write callback. `url` and `modules` are stored as pointers, not copied.
 *
 * @param url     URL of the page being fetched
 * @param modules module list stored for the write callback
 * @param data    callback state to initialise
 */
void initcbdata(const char *url, moduleentryp_dynarr_t *modules, cbdata_t *data) {
    memset(data, 0, sizeof *data);

    data->writecb_data.base = xmalloc(INIT_PAGE_SIZE);
    data->writecb_data.begin = data->writecb_data.base;
    data->writecb_data.end = data->writecb_data.base + INIT_PAGE_SIZE;

    // bufwritecb NUL-terminates the buffer after every write; start from the
    // same state so a response with an empty body reads as an empty string.
    data->writecb_data.begin[0] = '\0';

    data->writecb_data.url = url;
    data->writecb_data.modules = modules;
}
|
|
|
|
/**
 * Creates and configures a libcurl easy handle for fetching `url`.
 *
 * Sets transfer speed caps, timeouts, redirect following (at most 3), the
 * request headers from `host_entry`, the http/https protocol restriction and
 * the body write callback. When `wasrequested` is false, the header callback
 * (status / Content-Type filtering) is installed as well.
 *
 * @param url          URL to fetch
 * @param host_entry   supplies the header list passed to CURLOPT_HTTPHEADER
 * @param cbdata       state handed to the write and header callbacks
 * @param wasrequested when true, the header callback is not installed
 * @return the configured handle, or NULL if initialisation or any option
 *         failed (the handle is cleaned up before returning NULL).
 */
CURL *makehandle(const char *url, hostentry_t *host_entry, cbdata_t *cbdata, bool wasrequested) {
    CURL *curl_h = curl_easy_init();
    if (curl_h == NULL) {
        error("curl failed to initialize\n");
        return NULL;
    }

    // The *_LARGE options take curl_off_t and the others below take long;
    // curl_easy_setopt is variadic, so the argument types must match exactly.
    const curl_off_t max_speed = (curl_off_t)1024 * 1024; /* 1MiB/s */
    CURLcode easy_res;

    if ((easy_res = curl_easy_setopt(curl_h, CURLOPT_MAX_SEND_SPEED_LARGE, max_speed)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAX_RECV_SPEED_LARGE, max_speed)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_TIMEOUT_MS, (long)TIMEOUT_MS)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_CONNECTTIMEOUT_MS, (long)CONNECT_TIMEOUT_MS)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_FOLLOWLOCATION, 1L)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAXREDIRS, 3L)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_HTTPHEADER, host_entry->headers)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_PROTOCOLS_STR, "http,https")) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_WRITEFUNCTION, bufwritecb)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_WRITEDATA, &cbdata->writecb_data)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_URL, url)) != CURLE_OK)
        goto setopt_failed;

    if (!wasrequested &&
        ((easy_res = curl_easy_setopt(curl_h, CURLOPT_HEADERFUNCTION, headerwritecb)) != CURLE_OK ||
         (easy_res = curl_easy_setopt(curl_h, CURLOPT_HEADERDATA, &cbdata->headercb_data)) != CURLE_OK))
        goto setopt_failed;

    return curl_h;

setopt_failed:
    error("curl setopt failed: %s (code %d)\n", curl_easy_strerror(easy_res), (int)easy_res);
    curl_easy_cleanup(curl_h);
    return NULL;
}
|
|
|
|
|