1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 
3 #include <errno.h>
4 #include <stdlib.h>
5 #include <string.h>
6 
7 #include "alloc-util.h"
8 #include "sort-util.h"
9 #include "strbuf.h"
10 
11 /*
 * Strbuf stores given strings in a single contiguous allocated memory
 * area. Identical strings are de-duplicated and return the same offset
 * as the first string stored. If a new string is the tail of a string
 * already in the buffer, the offset of that existing tail is returned.
16  *
17  * A trie (http://en.wikipedia.org/wiki/Trie) is used to maintain the
18  * information about the stored strings.
19  *
20  * Example of udev rules:
21  *   $ ./udevadm test .
22  *   ...
23  *   read rules file: /usr/lib/udev/rules.d/99-systemd.rules
24  *   rules contain 196608 bytes tokens (16384 * 12 bytes), 39742 bytes strings
25  *   23939 strings (207859 bytes), 20404 de-duplicated (171653 bytes), 3536 trie nodes used
26  *   ...
27  */
28 
strbuf_new(void)29 struct strbuf* strbuf_new(void) {
30         struct strbuf *str;
31 
32         str = new(struct strbuf, 1);
33         if (!str)
34                 return NULL;
35         *str = (struct strbuf) {
36                 .buf = new0(char, 1),
37                 .root = new0(struct strbuf_node, 1),
38                 .len = 1,
39                 .nodes_count = 1,
40         };
41         if (!str->buf || !str->root) {
42                 free(str->buf);
43                 free(str->root);
44                 return mfree(str);
45         }
46 
47         return str;
48 }
49 
strbuf_node_cleanup(struct strbuf_node * node)50 static struct strbuf_node* strbuf_node_cleanup(struct strbuf_node *node) {
51         size_t i;
52 
53         for (i = 0; i < node->children_count; i++)
54                 strbuf_node_cleanup(node->children[i].child);
55         free(node->children);
56         return mfree(node);
57 }
58 
59 /* clean up trie data, leave only the string buffer */
strbuf_complete(struct strbuf * str)60 void strbuf_complete(struct strbuf *str) {
61         if (!str)
62                 return;
63         if (str->root)
64                 str->root = strbuf_node_cleanup(str->root);
65 }
66 
67 /* clean up everything */
strbuf_free(struct strbuf * str)68 struct strbuf* strbuf_free(struct strbuf *str) {
69         if (!str)
70                 return NULL;
71 
72         strbuf_complete(str);
73         free(str->buf);
74         return mfree(str);
75 }
76 
strbuf_children_cmp(const struct strbuf_child_entry * n1,const struct strbuf_child_entry * n2)77 static int strbuf_children_cmp(const struct strbuf_child_entry *n1,
78                                const struct strbuf_child_entry *n2) {
79         return n1->c - n2->c;
80 }
81 
bubbleinsert(struct strbuf_node * node,uint8_t c,struct strbuf_node * node_child)82 static void bubbleinsert(struct strbuf_node *node,
83                          uint8_t c,
84                          struct strbuf_node *node_child) {
85 
86         struct strbuf_child_entry new = {
87                 .c = c,
88                 .child = node_child,
89         };
90         int left = 0, right = node->children_count;
91 
92         while (right > left) {
93                 int middle = (right + left) / 2 ;
94                 if (strbuf_children_cmp(&node->children[middle], &new) <= 0)
95                         left = middle + 1;
96                 else
97                         right = middle;
98         }
99 
100         memmove(node->children + left + 1, node->children + left,
101                 sizeof(struct strbuf_child_entry) * (node->children_count - left));
102         node->children[left] = new;
103 
104         node->children_count++;
105 }
106 
107 /* add string, return the index/offset into the buffer */
strbuf_add_string(struct strbuf * str,const char * s,size_t len)108 ssize_t strbuf_add_string(struct strbuf *str, const char *s, size_t len) {
109         uint8_t c;
110         char *buf_new;
111         struct strbuf_child_entry *child;
112         struct strbuf_node *node;
113         ssize_t off;
114 
115         if (!str->root)
116                 return -EINVAL;
117 
118         /* search string; start from last character to find possibly matching tails */
119 
120         str->in_count++;
121         if (len == 0) {
122                 str->dedup_count++;
123                 return 0;
124         }
125         str->in_len += len;
126 
127         node = str->root;
128         for (size_t depth = 0; depth <= len; depth++) {
129                 struct strbuf_child_entry search;
130 
131                 /* match against current node */
132                 off = node->value_off + node->value_len - len;
133                 if (depth == len || (node->value_len >= len && memcmp(str->buf + off, s, len) == 0)) {
134                         str->dedup_len += len;
135                         str->dedup_count++;
136                         return off;
137                 }
138 
139                 c = s[len - 1 - depth];
140 
141                 /* lookup child node */
142                 search.c = c;
143                 child = typesafe_bsearch(&search, node->children, node->children_count, strbuf_children_cmp);
144                 if (!child)
145                         break;
146                 node = child->child;
147         }
148 
149         /* add new string */
150         buf_new = realloc(str->buf, str->len + len+1);
151         if (!buf_new)
152                 return -ENOMEM;
153         str->buf = buf_new;
154         off = str->len;
155         memcpy(str->buf + off, s, len);
156         str->len += len;
157         str->buf[str->len++] = '\0';
158 
159         /* new node */
160         _cleanup_free_ struct strbuf_node *node_child = NULL;
161 
162         node_child = new(struct strbuf_node, 1);
163         if (!node_child)
164                 return -ENOMEM;
165         *node_child = (struct strbuf_node) {
166                 .value_off = off,
167                 .value_len = len,
168         };
169 
170         /* extend array, add new entry, sort for bisection */
171         child = reallocarray(node->children, node->children_count + 1, sizeof(struct strbuf_child_entry));
172         if (!child)
173                 return -ENOMEM;
174 
175         str->nodes_count++;
176 
177         node->children = child;
178         bubbleinsert(node, c, TAKE_PTR(node_child));
179 
180         return off;
181 }
182