1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <stddef.h>
5
6 #include "macro.h"
7 #include "string-util.h"
8 #include "xml.h"
9
/* Tokenizer states. xml_tokenize() stores the current state between calls as an
 * integer packed into the caller-provided state pointer. */
enum {
        STATE_NULL      = 0, /* Nothing parsed yet: the line counter is reset and parsing continues as STATE_TEXT */
        STATE_TEXT      = 1, /* Outside of any tag: text, comments, processing instructions, DTDs, tag starts */
        STATE_TAG       = 2, /* Inside a tag, after the tag name or after an attribute value */
        STATE_ATTRIBUTE = 3, /* Directly after an attribute name, where an optional '=' and value follow */
};
16
/* Adds the number of '\n' bytes found in the first n bytes of s to *line.
 * Does nothing when line is NULL, i.e. when the caller does not track line numbers. */
static void inc_lines(unsigned *line, const char *s, size_t n) {
        const char *end, *nl;

        if (!line)
                return;

        end = s + n;

        /* Jump from newline to newline until none is left in the remaining range. */
        while ((nl = memchr(s, '\n', (size_t) (end - s)))) {
                (*line)++;
                s = nl + 1;
        }
}
35
/* We don't actually do real XML here. We only read a simplistic
 * subset, that is a bit less strict than XML and lacks all the more
 * complex features, like entities, or namespaces. However, we do
 * support some HTML5-like simplifications. */
40
xml_tokenize(const char ** p,char ** name,void ** state,unsigned * line)41 int xml_tokenize(const char **p, char **name, void **state, unsigned *line) {
42 const char *c, *e, *b;
43 char *ret;
44 int t;
45
46 assert(p);
47 assert(*p);
48 assert(name);
49 assert(state);
50
51 t = PTR_TO_INT(*state);
52 c = *p;
53
54 if (t == STATE_NULL) {
55 if (line)
56 *line = 1;
57 t = STATE_TEXT;
58 }
59
60 for (;;) {
61 if (*c == 0)
62 return XML_END;
63
64 switch (t) {
65
66 case STATE_TEXT: {
67 int x;
68
69 e = strchrnul(c, '<');
70 if (e > c) {
71 /* More text... */
72 ret = strndup(c, e - c);
73 if (!ret)
74 return -ENOMEM;
75
76 inc_lines(line, c, e - c);
77
78 *name = ret;
79 *p = e;
80 *state = INT_TO_PTR(STATE_TEXT);
81
82 return XML_TEXT;
83 }
84
85 assert(*e == '<');
86 b = c + 1;
87
88 if (startswith(b, "!--")) {
89 /* A comment */
90 e = strstr(b + 3, "-->");
91 if (!e)
92 return -EINVAL;
93
94 inc_lines(line, b, e + 3 - b);
95
96 c = e + 3;
97 continue;
98 }
99
100 if (*b == '?') {
101 /* Processing instruction */
102
103 e = strstr(b + 1, "?>");
104 if (!e)
105 return -EINVAL;
106
107 inc_lines(line, b, e + 2 - b);
108
109 c = e + 2;
110 continue;
111 }
112
113 if (*b == '!') {
114 /* DTD */
115
116 e = strchr(b + 1, '>');
117 if (!e)
118 return -EINVAL;
119
120 inc_lines(line, b, e + 1 - b);
121
122 c = e + 1;
123 continue;
124 }
125
126 if (*b == '/') {
127 /* A closing tag */
128 x = XML_TAG_CLOSE;
129 b++;
130 } else
131 x = XML_TAG_OPEN;
132
133 e = strpbrk(b, WHITESPACE "/>");
134 if (!e)
135 return -EINVAL;
136
137 ret = strndup(b, e - b);
138 if (!ret)
139 return -ENOMEM;
140
141 *name = ret;
142 *p = e;
143 *state = INT_TO_PTR(STATE_TAG);
144
145 return x;
146 }
147
148 case STATE_TAG:
149
150 b = c + strspn(c, WHITESPACE);
151 if (*b == 0)
152 return -EINVAL;
153
154 inc_lines(line, c, b - c);
155
156 e = b + strcspn(b, WHITESPACE "=/>");
157 if (e > b) {
158 /* An attribute */
159
160 ret = strndup(b, e - b);
161 if (!ret)
162 return -ENOMEM;
163
164 *name = ret;
165 *p = e;
166 *state = INT_TO_PTR(STATE_ATTRIBUTE);
167
168 return XML_ATTRIBUTE_NAME;
169 }
170
171 if (startswith(b, "/>")) {
172 /* An empty tag */
173
174 *name = NULL; /* For empty tags we return a NULL name, the caller must be prepared for that */
175 *p = b + 2;
176 *state = INT_TO_PTR(STATE_TEXT);
177
178 return XML_TAG_CLOSE_EMPTY;
179 }
180
181 if (*b != '>')
182 return -EINVAL;
183
184 c = b + 1;
185 t = STATE_TEXT;
186 continue;
187
188 case STATE_ATTRIBUTE:
189
190 if (*c == '=') {
191 c++;
192
193 if (IN_SET(*c, '\'', '"')) {
194 /* Tag with a quoted value */
195
196 e = strchr(c+1, *c);
197 if (!e)
198 return -EINVAL;
199
200 inc_lines(line, c, e - c);
201
202 ret = strndup(c+1, e - c - 1);
203 if (!ret)
204 return -ENOMEM;
205
206 *name = ret;
207 *p = e + 1;
208 *state = INT_TO_PTR(STATE_TAG);
209
210 return XML_ATTRIBUTE_VALUE;
211
212 }
213
214 /* Tag with a value without quotes */
215
216 b = strpbrk(c, WHITESPACE ">");
217 if (!b)
218 b = c;
219
220 ret = strndup(c, b - c);
221 if (!ret)
222 return -ENOMEM;
223
224 *name = ret;
225 *p = b;
226 *state = INT_TO_PTR(STATE_TAG);
227 return XML_ATTRIBUTE_VALUE;
228 }
229
230 t = STATE_TAG;
231 continue;
232 }
233
234 }
235
236 assert_not_reached();
237 }
238