/*
 * Spider2/src/http.c
 *
 * 153 lines
 * 6.2 KiB
 * C
 */

#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <curl/curl.h>
#include "http.h"
#include "util.h"
/* Size in bytes of the page buffer that initcbdata() allocates up front. */
#define INIT_PAGE_SIZE 8192
/* Upper bound in bytes (1 MiB) that bufwritecb() will grow a page buffer to. */
#define MAX_PAGE_SIZE 1048576
/* Multiplier applied to the buffer size each time bufwritecb() has to grow it. */
#define MEM_INC_FACTOR 2
/* Value handed to CURLOPT_TIMEOUT_MS in makehandle(). */
#define TIMEOUT_MS 10000
/* Value handed to CURLOPT_CONNECTTIMEOUT_MS in makehandle(). */
#define CONNECT_TIMEOUT_MS 3000
/*
 * NULL-terminated table of agent names.
 * NOTE(review): presumably used as User-Agent header values; nothing in the
 * visible part of this file reads the table -- confirm against its users.
 */
const char *useragents[] = { "AvaBot", NULL };
/*
 * libcurl write callback: forwards the received chunk to every module's
 * onpagewrite hook, then appends it to the growable page buffer.
 *
 * The buffer [base, end) is kept NUL-terminated at `begin`, so one byte is
 * always reserved for the terminator.  It grows by MEM_INC_FACTOR up to
 * MAX_PAGE_SIZE; once that cap is reached the chunk is truncated.
 *
 * ptr      - received bytes (not NUL-terminated)
 * size     - unused; libcurl documents it as always 1 for write callbacks
 * nmemb    - number of bytes available at ptr
 * userdata - buffer state, URL and module list for this transfer
 *
 * Returns the number of bytes stored.  A value smaller than nmemb (the page
 * hit MAX_PAGE_SIZE) makes libcurl abort the transfer with a write error.
 */
size_t bufwritecb(const byte *ptr, size_t size, size_t nmemb, writecb_data_t *userdata) {
    (void)size;

    for (size_t i = 0; i < userdata->modules->len; i++) {
        if (userdata->modules->data[i]->module.onpagewrite == NULL)
            continue;
        int rc = userdata->modules->data[i]->module.onpagewrite(
            userdata->modules->data[i]->module.userdata, userdata->url, ptr, nmemb);
        /* A failing module is reported but does not stop the page from being buffered. */
        if (rc != 0)
            error("module %s onpagewrite failed with code %d", userdata->modules->data[i]->name, rc);
    }

    /* Work with offsets so nothing is derived from a pointer invalidated by realloc. */
    size_t used = (size_t)(userdata->begin - userdata->base);
    size_t cap = (size_t)(userdata->end - userdata->base);

    /* Grow until the chunk plus its terminator fits, or the size cap is reached. */
    while (used + nmemb + 1 > cap && cap < MAX_PAGE_SIZE) {
        size_t new_cap = cap * MEM_INC_FACTOR;
        if (new_cap > MAX_PAGE_SIZE)
            new_cap = MAX_PAGE_SIZE;
        if (new_cap <= cap)
            break; /* cannot make progress (e.g. zero-sized buffer) */
        /* NOTE(review): xrealloc is assumed not to return NULL -- confirm in util.h. */
        byte *new_base = xrealloc(userdata->base, new_cap);
        userdata->base = new_base;
        userdata->begin = new_base + used;
        userdata->end = new_base + new_cap;
        cap = new_cap;
    }

    /* No room even for the terminator: store nothing. */
    if (used >= cap)
        return 0;

    /* Keep one byte back for the NUL so the terminator never lands on `end`. */
    size_t room = cap - used - 1;
    size_t len = nmemb < room ? nmemb : room;

    if (len > 0)
        memcpy(userdata->begin, ptr, len);
    userdata->begin[len] = '\0';
    userdata->begin += len;
    return len;
}
/*
 * Reports whether an HTTP status code is one of the redirect codes this
 * crawler recognises (301, 302, 307, 308).
 */
bool is_redirect(int status) {
    switch (status) {
    case 301: /* Moved Permanently */
    case 302: /* Found */
    case 307: /* Temporary Redirect */
    case 308: /* Permanent Redirect */
        return true;
    default:
        return false;
    }
}
/*
 * libcurl header callback: called once per response header line.
 *
 * - On a status line ("HTTP/..."), counts the request, records the 3-digit
 *   status code and aborts the transfer unless it is 200 or a redirect.
 * - On the first Content-Type header of a non-redirect response, aborts the
 *   transfer unless the value starts with "text/html"; on success sets
 *   HEADERCB_VALID_MIME.
 * - Every other header line is accepted untouched.
 *
 * buffer   - header line; NOT NUL-terminated, so all reads are bounded by nitems
 * _size    - unused
 * nitems   - number of bytes in buffer
 * userdata - per-transfer header state (status, flags, num_requests)
 *
 * Returns nitems to continue, or CURL_WRITEFUNC_ERROR to abort the transfer.
 */
size_t headerwritecb(const char *buffer, size_t _size, size_t nitems, headercb_data_t *userdata) {
    const char content_type_str[] = "content-type:", html_mime_str[] = "text/html", http_str[] = "HTTP/";
    const size_t http_len = sizeof(http_str) - 1;
    const size_t content_type_len = sizeof(content_type_str) - 1;
    const size_t html_mime_len = sizeof(html_mime_str) - 1;

    (void)_size;

    if (nitems < http_len)
        return nitems;

    if (memcmp(buffer, http_str, http_len) == 0) {
        /* Header is an HTTP status line: "HTTP/x.y <code> <reason>" */
        userdata->num_requests++;

        const char *const line_end = buffer + nitems;
        const char *code = memchr(buffer, ' ', nitems);
        while (code != NULL && code < line_end && *code == ' ')
            code++;

        /* Reject lines with no separator, fewer than 3 bytes left, or a non-numeric code. */
        if (code == NULL || line_end - code < 3 ||
            !isdigit((unsigned char)code[0]) ||
            !isdigit((unsigned char)code[1]) ||
            !isdigit((unsigned char)code[2])) {
            userdata->status = 0;
            return CURL_WRITEFUNC_ERROR;
        }

        userdata->status = (code[0] - '0') * 100 + (code[1] - '0') * 10 + (code[2] - '0');
        if (userdata->status == 200 || is_redirect(userdata->status))
            return nitems;
        return CURL_WRITEFUNC_ERROR;
    }

    /* Headers of a redirect response (or seen before any status line) are not inspected. */
    if (userdata->status == 0 || is_redirect(userdata->status))
        return nitems;

    /* Only the first Content-Type header of a response is evaluated. */
    if (userdata->flags & HEADERCB_CONTENT_TYPE_ENCOUNTERED)
        return nitems;

    if (nitems < content_type_len)
        return nitems;
    /* Header names are case-insensitive; compare against the lowercase literal. */
    for (size_t i = 0; i < content_type_len; i++)
        if (tolower((unsigned char)buffer[i]) != content_type_str[i])
            return nitems;

    /* The value starts right after the ':'; skip optional spaces/tabs before it. */
    size_t pos = content_type_len;
    while (pos < nitems && (buffer[pos] == ' ' || buffer[pos] == '\t'))
        pos++;

    userdata->flags |= HEADERCB_CONTENT_TYPE_ENCOUNTERED;

    /* A value too short to hold "text/html" cannot be HTML; treat it as a mismatch. */
    if (nitems - pos < html_mime_len ||
        memcmp(buffer + pos, html_mime_str, html_mime_len) != 0)
        return CURL_WRITEFUNC_ERROR;

    userdata->flags |= HEADERCB_VALID_MIME;
    return nitems;
}
/*
 * Resets *data to zero and prepares its write-callback state for a new
 * transfer: allocates an INIT_PAGE_SIZE page buffer and records the URL and
 * module list that bufwritecb() will use.
 *
 * url     - stored by pointer, not copied
 * modules - stored by pointer, not copied
 * data    - callback state to initialise; any previous contents are overwritten
 *
 * NOTE(review): xmalloc is assumed not to return NULL -- confirm in util.h.
 */
void initcbdata(const char *url, moduleentryp_dynarr_t *modules, cbdata_t *data) {
    memset(data, 0, sizeof(cbdata_t));
    data->writecb_data.base = xmalloc(INIT_PAGE_SIZE);
    data->writecb_data.begin = data->writecb_data.base;
    data->writecb_data.end = data->writecb_data.base + INIT_PAGE_SIZE;
    /*
     * bufwritecb() keeps the buffer NUL-terminated at `begin`; start it as an
     * empty string so a response with no body does not leave it uninitialised.
     */
    data->writecb_data.begin[0] = '\0';
    data->writecb_data.url = url;
    data->writecb_data.modules = modules;
}
/*
 * Creates and configures a curl easy handle for fetching `url`.
 *
 * The handle is limited to HTTP(S), rate-limited to 1 MiB/s in each direction,
 * follows at most 3 redirects, and writes the body through bufwritecb() into
 * cbdata->writecb_data.  Unless `wasrequested` is true, response headers are
 * also routed through headerwritecb() with cbdata->headercb_data.
 *
 * url          - URL to fetch
 * host_entry   - its `headers` member is passed as CURLOPT_HTTPHEADER
 *                (NOTE(review): expected to be a struct curl_slist * -- confirm)
 * cbdata       - callback state; must outlive the returned handle's transfer
 * wasrequested - when true, no header callback is installed
 *
 * Returns the configured handle, or NULL after logging an error.  On failure
 * the handle is cleaned up here; on success it is returned without cleanup,
 * so releasing it is left to the caller.
 */
CURL *makehandle(const char *url, hostentry_t *host_entry, cbdata_t *cbdata, bool wasrequested) {
    CURL *curl_h = curl_easy_init();
    CURLcode easy_res;

    if (curl_h == NULL) {
        error("curl failed to initialize\n");
        return NULL;
    }

    /*
     * curl_easy_setopt() is variadic, so each argument must already have the
     * exact type the option expects: curl_off_t for the *_LARGE options and
     * long for the numeric ones.
     */
    if (/* 1MiB/s max send speed */
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAX_SEND_SPEED_LARGE, (curl_off_t)(1024 * 1024))) != CURLE_OK ||
        /* 1MiB/s max recv speed */
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAX_RECV_SPEED_LARGE, (curl_off_t)(1024 * 1024))) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_TIMEOUT_MS, (long)TIMEOUT_MS)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_CONNECTTIMEOUT_MS, (long)CONNECT_TIMEOUT_MS)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_FOLLOWLOCATION, 1L)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_MAXREDIRS, 3L)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_HTTPHEADER, host_entry->headers)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_PROTOCOLS_STR, "http,https")) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_WRITEFUNCTION, bufwritecb)) != CURLE_OK ||
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_WRITEDATA, &cbdata->writecb_data)) != CURLE_OK ||
        /* Parenthesised so easy_res holds the CURLcode, not the comparison result. */
        (easy_res = curl_easy_setopt(curl_h, CURLOPT_URL, url)) != CURLE_OK) {
        error("curl setopt failed: %s (code %d)\n", curl_easy_strerror(easy_res), (int)easy_res);
        curl_easy_cleanup(curl_h);
        return NULL;
    }

    if (!wasrequested &&
        ((easy_res = curl_easy_setopt(curl_h, CURLOPT_HEADERFUNCTION, headerwritecb)) != CURLE_OK ||
         (easy_res = curl_easy_setopt(curl_h, CURLOPT_HEADERDATA, &cbdata->headercb_data)) != CURLE_OK)) {
        error("curl setopt failed: %s (code %d)\n", curl_easy_strerror(easy_res), (int)easy_res);
        curl_easy_cleanup(curl_h);
        return NULL;
    }

    return curl_h;
}