blob: aee6647e35a6efe3f4a16a2509b10f5218fd6f02 [file] [log] [blame] [raw]
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include "alloc-util.h"
#include "sort-util.h"
#include "strbuf.h"
/*
* Strbuf stores the given strings in a single contiguous allocated memory
* area. Identical strings are de-duplicated and return the same offset
* as the first string stored. If a new string matches the tail of a string
* already in the buffer, the offset of that tail is returned instead.
*
* A trie (http://en.wikipedia.org/wiki/Trie) is used to maintain the
* information about the stored strings.
*
* Example of udev rules:
* $ ./udevadm test .
* ...
* read rules file: /usr/lib/udev/rules.d/99-systemd.rules
* rules contain 196608 bytes tokens (16384 * 12 bytes), 39742 bytes strings
* 23939 strings (207859 bytes), 20404 de-duplicated (171653 bytes), 3536 trie nodes used
* ...
*/
/* Allocate a fresh strbuf holding a one-byte buffer (allocated with new0())
 * and an empty trie root. Returns NULL if any of the allocations fails. */
struct strbuf *strbuf_new(void) {
        struct strbuf *sb;
        struct strbuf_node *trie_root;
        char *buffer;

        sb = new(struct strbuf, 1);
        if (!sb)
                return NULL;

        buffer = new0(char, 1);
        trie_root = new0(struct strbuf_node, 1);

        if (buffer && trie_root) {
                /* Both pieces are available: fill in the object. Fields not
                 * named here are zero-initialized by the compound literal. */
                *sb = (struct strbuf) {
                        .buf = buffer,
                        .root = trie_root,
                        .len = 1,
                        .nodes_count = 1,
                };
                return sb;
        }

        /* At least one allocation failed; free(NULL) is a no-op. */
        free(buffer);
        free(trie_root);
        free(sb);
        return NULL;
}
/* Recursively release a trie node: all child subtrees first, then the child
 * array, then the node itself. Returns NULL so the caller can overwrite its
 * pointer with the result. */
static struct strbuf_node* strbuf_node_cleanup(struct strbuf_node *node) {
        size_t idx = 0;

        while (idx < node->children_count) {
                strbuf_node_cleanup(node->children[idx].child);
                idx++;
        }

        free(node->children);
        free(node);
        return NULL;
}
/* Drop the trie bookkeeping and keep only the string buffer. After this call
 * str->root holds whatever strbuf_node_cleanup() returned. A NULL str or an
 * already-released trie is a no-op. */
void strbuf_complete(struct strbuf *str) {
        if (str && str->root)
                str->root = strbuf_node_cleanup(str->root);
}
/* Release everything owned by the strbuf: the trie, the string buffer and
 * the strbuf object itself. Passing NULL is a no-op. */
void strbuf_cleanup(struct strbuf *str) {
        if (str) {
                strbuf_complete(str);
                free(str->buf);
                free(str);
        }
}
/* Order two child entries by their key character. The result is negative,
 * zero or positive, as expected by bsearch-style comparators.
 * NOTE(review): assumes the 'c' field is narrower than int (it is assigned
 * from a uint8_t elsewhere in this file), so the subtraction cannot overflow. */
static int strbuf_children_cmp(const struct strbuf_child_entry *n1,
                               const struct strbuf_child_entry *n2) {
        int lhs = n1->c;
        int rhs = n2->c;

        return lhs - rhs;
}
static void bubbleinsert(struct strbuf_node *node,
uint8_t c,
struct strbuf_node *node_child) {
struct strbuf_child_entry new = {
.c = c,
.child = node_child,
};
int left = 0, right = node->children_count;
while (right > left) {
int middle = (right + left) / 2 ;
if (strbuf_children_cmp(&node->children[middle], &new) <= 0)
left = middle + 1;
else
right = middle;
}
memmove(node->children + left + 1, node->children + left,
sizeof(struct strbuf_child_entry) * (node->children_count - left));
node->children[left] = new;
node->children_count++;
}
/* add string, return the index/offset into the buffer */
ssize_t strbuf_add_string(struct strbuf *str, const char *s, size_t len) {
uint8_t c;
struct strbuf_node *node;
size_t depth;
char *buf_new;
struct strbuf_child_entry *child;
struct strbuf_node *node_child;
ssize_t off;
if (!str->root)
return -EINVAL;
/* search string; start from last character to find possibly matching tails */
str->in_count++;
if (len == 0) {
str->dedup_count++;
return 0;
}
str->in_len += len;
node = str->root;
for (depth = 0; depth <= len; depth++) {
struct strbuf_child_entry search;
/* match against current node */
off = node->value_off + node->value_len - len;
if (depth == len || (node->value_len >= len && memcmp(str->buf + off, s, len) == 0)) {
str->dedup_len += len;
str->dedup_count++;
return off;
}
c = s[len - 1 - depth];
/* lookup child node */
search.c = c;
child = typesafe_bsearch(&search, node->children, node->children_count, strbuf_children_cmp);
if (!child)
break;
node = child->child;
}
/* add new string */
buf_new = realloc(str->buf, str->len + len+1);
if (!buf_new)
return -ENOMEM;
str->buf = buf_new;
off = str->len;
memcpy(str->buf + off, s, len);
str->len += len;
str->buf[str->len++] = '\0';
/* new node */
node_child = new(struct strbuf_node, 1);
if (!node_child)
return -ENOMEM;
*node_child = (struct strbuf_node) {
.value_off = off,
.value_len = len,
};
/* extend array, add new entry, sort for bisection */
child = reallocarray(node->children, node->children_count + 1, sizeof(struct strbuf_child_entry));
if (!child) {
free(node_child);
return -ENOMEM;
}
str->nodes_count++;
node->children = child;
bubbleinsert(node, c, node_child);
return off;
}