1#include <string.h>
2#include "url.h"
3
4/**
5 * URI Schemes
6 * http://en.wikipedia.org/wiki/URI_scheme
7 */
8
// Table of scheme names accepted by url_is_protocol(); entries are compared
// with strcmp(), so they must be spelled exactly as they appear in a URL.
static const char *URL_SCHEMES[] = {
  // official IANA registered schemes
  "aaa", "aaas", "about", "acap", "acct", "adiumxtra", "afp", "afs", "aim", "apt", "attachment", "aw",
  "beshare", "bitcoin", "bolo", "callto", "cap", "chrome", "chrome-extension", "com-eventbrite-attendee",
  "cid", "coap", "coaps","content", "crid", "cvs", "data", "dav", "dict", "dlna-playsingle", "dlna-playcontainer",
  "dns", "dtn", "dvb", "ed2k", "facetime", "fax", "feed", "file", "finger", "fish","ftp", "geo", "gg","git",
  "gizmoproject", "go", "gopher", "gtalk", "h323", "hcp", "http", "https", "iax", "icap", "icon","im",
  "imap", "info", "ipn", "ipp", "irc", "irc6", "ircs", "iris", "iris.beep", "iris.xpc", "iris.xpcs","iris.lws",
  "itms", "jabber", "jar", "jms", "keyparc", "lastfm", "ldap", "ldaps", "magnet", "mailserver","mailto",
  "maps", "market", "message", "mid", "mms", "modem", "ms-help", "ms-settings-power", "msnim", "msrp",
  "msrps", "mtqp", "mumble", "mupdate", "mvn", "news", "nfs", "ni", "nih", "nntp", "notes","oid",
  "opaquelocktoken", "pack", "palm", "paparazzi", "pkcs11", "platform", "pop", "pres", "prospero", "proxy",
  "psyc","query", "reload", "res", "resource", "rmi", "rsync", "rtmp","rtsp", "secondlife", "service","session",
  "sftp", "sgn", "shttp", "sieve", "sip", "sips", "skype", "smb", "sms", "snews", "snmp", "soap.beep","soap.beeps",
  "soldat", "spotify", "ssh", "steam", "svn", "tag", "teamspeak", "tel", "telnet", "tftp", "things","thismessage",
  "tn3270", "tip", "tv", "udp", "unreal", "urn", "ut2004", "vemmi","ventrilo", "videotex", "view-source", "wais","webcal",
  "ws", "wss", "wtai", "wyciwyg", "xcon", "xcon-userid", "xfire","xmlrpc.beep", "xmlrpc.beeps", "xmpp", "xri","ymsgr",

  // unofficial schemes
  "javascript", "jdbc", "doi"
};
30
#ifndef HAVE_STRDUP
// strdup() comes from POSIX, not from ISO C before C23, so a fallback is
// supplied unless the feature-test macro promises POSIX.1-2008.
// Define HAVE_STRDUP to suppress the fallback unconditionally.
#if !defined(_POSIX_C_SOURCE) || _POSIX_C_SOURCE < 200809L
/**
 * Duplicate the NUL-terminated string `str`.
 *
 * Returns a malloc()ed copy that the caller must free(), or NULL when the
 * allocation fails. `str` must not be NULL.
 */
char *
strdup (const char *str) {
  const size_t n = strlen(str) + 1; // include the terminating NUL
  char *dup = (char *) malloc(n);
  if (dup) memcpy(dup, str, n);
  return dup;
}
#endif
#endif
43
44
/**
 * "Fast-forward" a string: return a newly allocated copy of whatever
 * starts `n` characters after `ptr`.
 *
 * The offset is applied to the underlying buffer and is deliberately NOT
 * clamped to strlen(ptr); the caller must guarantee that `ptr + n` still
 * points into (or at the terminator of) a NUL-terminated buffer.
 *
 * Returns NULL when `ptr` is NULL or the copy cannot be allocated;
 * otherwise the caller owns the result and must free() it.
 */
static char *
strff (const char* ptr, size_t n) {
  if (!ptr) return NULL;

  return strdup(ptr + n);
}
53
54
/**
 * Extract one component of `url`.
 *
 * The scan starts `l` characters into `url` (callers pass the length of
 * everything that precedes the wanted component, e.g. the protocol plus
 * the 3 characters of `://`) and applies the sscanf `format`, which must
 * contain exactly one string conversion (`%s` or `%[...]`).
 *
 * Returns a malloc()ed string owned by the caller. When the format does
 * not match, the result is an empty string rather than NULL. NULL is
 * returned when:
 *   - `url` or `format` is NULL,
 *   - `l` is negative or larger than strlen(url),
 *   - the allocation fails,
 *   - the extracted text equals the whole `url` (nothing was split off).
 */
static char *
get_part (const char* url, const char *format, int l) {
  if (!url || !format || l < 0) return NULL;

  const size_t url_len = strlen(url);
  if ((size_t) l > url_len) return NULL;

  // a single string conversion can never yield more than the input itself
  char *part = (char *) malloc(url_len + 1);
  if (!part) return NULL;
  part[0] = '\0'; // stays empty when sscanf matches nothing

  sscanf(url + l, format, part);

  if (0 == strcmp(part, url)) {
    free(part);
    return NULL;
  }

  return part;
}
90
91url_data_t*
92url_parse (char* url) {
93 url_data_t *data = (url_data_t *) calloc(1, sizeof(url_data_t));
94 if (!data) return NULL;
95
96 data->href = url;
97 char *tmp_url = strdup(url);
98
99 char *protocol = url_get_protocol(tmp_url);
100 if (!protocol) {
101 free(tmp_url);
102 free(data);
103 return NULL;
104 }
105 // length of protocol plus ://
106 const size_t protocol_len = strlen(protocol) + 3;
107 data->protocol = protocol;
108
109 const bool is_ssh = url_is_ssh(protocol);
110
111 size_t auth_len = 0;
112 if (strstr(tmp_url, "@")) {
113 data->auth = get_part(tmp_url, "%[^@]", protocol_len);
114 auth_len = strlen(data->auth);
115 if (data->auth) auth_len++;
116 }
117
118 char *hostname = (is_ssh)
119 ? get_part(tmp_url, "%[^:]", protocol_len + auth_len)
120 : get_part(tmp_url, "%[^/]", protocol_len + auth_len);
121
122 if (!hostname) {
123 free(tmp_url);
124 url_free(data);
125 return NULL;
126 }
127 const size_t hostname_len = strlen(hostname);
128 char *tmp_hostname = strdup(hostname);
129 data->hostname = hostname;
130
131 char *host = (char *) malloc((strlen(tmp_hostname)+1));
132 sscanf(tmp_hostname, "%[^:]", host);
133 free(tmp_hostname);
134 if (!host) {
135 free(tmp_url);
136 url_free(data);
137 return NULL;
138 }
139 data->host = host;
140
141 const size_t host_len = strlen(host);
142 if (hostname_len > host_len) {
143 data->port = strff(hostname, host_len + 1); // +1 for ':' char;
144 } else {
145 data->port = NULL;
146 }
147
148 char *tmp_path = (is_ssh)
149 ? get_part(tmp_url, ":%s", protocol_len + auth_len + hostname_len)
150 : get_part(tmp_url, "/%s", protocol_len + auth_len + hostname_len);
151
152 char *path = (char *) malloc((strlen(tmp_path) + 2));
153 if (!path) {
154 free(tmp_url);
155 url_free(data);
156 return NULL;
157 }
158 const char *fmt = (is_ssh)? "%s" : "/%s";
159 sprintf(path, fmt, tmp_path);
160 data->path = path;
161
162 char *pathname = (char *) malloc((strlen(tmp_path) + 2));
163 free(tmp_path);
164 if (!pathname) {
165 free(tmp_url);
166 url_free(data);
167 return NULL;
168 }
169 strcat(pathname, "");
170 tmp_path = strdup(path);
171 sscanf(tmp_path, "%[^? | ^#]", pathname);
172 const size_t pathname_len = strlen(pathname);
173 data->pathname = pathname;
174
175 char* tmp_path_new = strff(tmp_path, pathname_len);
176 free(tmp_path);
177 tmp_path = tmp_path_new;
178 char* search = NULL;
179 sscanf(tmp_path, "%m[^#]", &search);
180 data->search = search;
181
182 const size_t search_len = search ? strlen(search) : 0;
183 free(tmp_path);
184
185 if(search) {
186 char* query = NULL;
187 sscanf(search, "?%ms", &query);
188 data->query = query;
189 }
190
191 char* hash = NULL;
192 tmp_path = strff(path, pathname_len + search_len);
193 sscanf(tmp_path, "%ms", &hash);
194 data->hash = hash;
195 free(tmp_path);
196 free(tmp_url);
197
198 return data;
199}
200
201bool
202url_is_protocol (const char* str) {
203 const unsigned count = sizeof(URL_SCHEMES) / sizeof(URL_SCHEMES[0]);
204
205 for (unsigned i = 0; i < count; ++i) {
206 if (0 == strcmp(URL_SCHEMES[i], str)) {
207 return true;
208 }
209 }
210
211 return false;
212}
213
/**
 * Report whether `str` is one of the protocols handled with ssh-style
 * parsing in this file, namely exactly "ssh" or "git".
 */
bool
url_is_ssh (const char* str) {
  return !strcmp(str, "ssh") || !strcmp(str, "git");
}
221
222char *
223url_get_protocol (const char* url) {
224 char *protocol = (char *) malloc(URL_PROTOCOL_MAX_LENGTH);
225 if (!protocol) return NULL;
226
227 sscanf(url, "%[^://]", protocol);
228 if (url_is_protocol(protocol)) return protocol;
229
230 free(protocol);
231 return NULL;
232}
233
234
/**
 * Extract the userinfo ("auth") of `url`: the text between "://" and the
 * '@' that ends it.
 *
 * Returns a malloc()ed string owned by the caller, or NULL when `url` is
 * NULL, has no recognised protocol, is shorter than "<protocol>://", or
 * has no '@' before the first '/', '?' or '#' that follows the protocol.
 */
char *
url_get_auth (const char* url) {
  if (!url) return NULL;

  char *protocol = url_get_protocol(url);
  if (!protocol) return NULL;
  const size_t l = strlen(protocol) + 3; // protocol plus ://
  free(protocol);

  if (l > strlen(url)) return NULL;

  // without an '@' inside the authority there is no userinfo at all;
  // scanning "%[^@]" anyway would return the rest of the URL
  bool has_auth = false;
  for (const char *p = url + l;
       *p && '/' != *p && '?' != *p && '#' != *p;
       ++p) {
    if ('@' == *p) {
      has_auth = true;
      break;
    }
  }
  if (!has_auth) return NULL;

  return get_part(url, "%[^@]", (int) l);
}
243
/**
 * Extract the hostname of `url`, including a ":port" suffix if present.
 * For ssh-style protocols (see url_is_ssh) the hostname ends at the first
 * ':'; otherwise it ends at the first '/'.
 *
 * Returns a malloc()ed string owned by the caller, or NULL when `url` is
 * NULL, has no recognised protocol, or is too short to hold a hostname.
 */
char *
url_get_hostname (const char* url) {
  if (!url) return NULL;

  char *protocol = url_get_protocol(url);
  if (!protocol) return NULL;

  // decide the separator before the protocol string is released
  const bool is_ssh = url_is_ssh(protocol);
  const size_t url_len = strlen(url);
  size_t l = strlen(protocol) + 3; // protocol plus ://
  free(protocol);

  char *auth = url_get_auth(url);
  if (auth) {
    const size_t auth_len = strlen(auth);
    // skip the userinfo only when an '@' really terminates it in `url`
    if (l + auth_len < url_len && '@' == url[l + auth_len]) {
      l += auth_len + 1; // add one @ symbol
    }
    free(auth);
  }

  if (l > url_len) return NULL;

  return get_part(url, is_ssh ? "%[^:]" : "%[^/]", (int) l);
}
267
/**
 * Extract the host of `url`: the hostname up to, but not including, the
 * first ':'.
 *
 * Returns a string allocated by sscanf's `%m` conversion that the caller
 * must free(), or NULL when no hostname is available or nothing matched.
 */
char *
url_get_host (const char* url) {
  char *full = url_get_hostname(url);
  char *result = NULL;

  if (full) {
    // keep everything in front of an optional ":port"
    sscanf(full, "%m[^:]", &result);
    free(full);
  }

  return result;
}
281
/**
 * Extract the pathname of `url`: the path up to, but not including, the
 * first '?' or '#'. url_parse() likewise ends the pathname at '#', so a
 * fragment is never part of it.
 *
 * Returns a string allocated by sscanf's `%m` conversion that the caller
 * must free(), or NULL when no path is available or the path starts with
 * '?' or '#'.
 */
char *
url_get_pathname (const char* url) {
  char *path = url_get_path(url);
  char *pathname = NULL;

  if (!path) return NULL;

  sscanf(path, "%m[^?#]", &pathname);

  free(path);

  return pathname;
}
295
/**
 * Extract the path of `url`: everything after the hostname, including any
 * query and hash. For non-ssh protocols the result starts with '/'; for
 * ssh-style protocols (see url_is_ssh) the ':' separator is dropped.
 *
 * Returns a malloc()ed string owned by the caller, or NULL when `url` is
 * NULL, the protocol or hostname cannot be determined, or an allocation
 * fails.
 */
char *
url_get_path (const char* url) {
  if (!url) return NULL;

  char *path = NULL;
  char *tmp_path = NULL;
  char *protocol = url_get_protocol(url);
  char *auth = url_get_auth(url);
  char *hostname = url_get_hostname(url);

  if (protocol && hostname) {
    const bool is_ssh = url_is_ssh(protocol);
    const size_t url_len = strlen(url);
    size_t l = strlen(protocol) + 3; // protocol plus ://

    if (auth) {
      const size_t auth_len = strlen(auth);
      // skip the userinfo only when an '@' really terminates it in `url`
      if (l + auth_len < url_len && '@' == url[l + auth_len]) {
        l += auth_len + 1; // @ symbol
      }
    }

    l += strlen(hostname);

    if (l <= url_len) {
      tmp_path = get_part(url, is_ssh ? ":%s" : "/%s", (int) l);
    }

    if (tmp_path) {
      // +2: room for the re-added leading '/' and the terminating NUL
      path = (char *) malloc(strlen(tmp_path) + 2);
      if (path) sprintf(path, is_ssh ? "%s" : "/%s", tmp_path);
    }
  }

  // single exit so that no intermediate string leaks
  free(auth);
  free(protocol);
  free(hostname);
  free(tmp_path);

  return path;
}
328
/**
 * Extract the search part of `url`: what follows the pathname inside the
 * path, up to (not including) the first '#'.
 *
 * Returns a string allocated by sscanf's `%m` conversion that the caller
 * must free(), or NULL when there is no path or no search part.
 */
char *
url_get_search (const char* url) {
  char *path = url_get_path(url);
  if (!path) return NULL;

  char *pathname = url_get_pathname(url);
  char *search = NULL;

  // a missing pathname means nothing precedes the search part
  const size_t offset = pathname ? strlen(pathname) : 0;

  if (offset <= strlen(path)) {
    sscanf(path + offset, "%m[^#]", &search);
  }

  free(path);
  free(pathname);

  return search;
}
344
/**
 * Extract the query of `url`: the search part without its leading '?'.
 *
 * Returns a string allocated by sscanf's `%m` conversion that the caller
 * must free(), or NULL when there is no search part or nothing follows
 * the '?'.
 */
char *
url_get_query (const char* url) {
  char *result = NULL;
  char *search = url_get_search(url);

  if (search) {
    sscanf(search, "?%ms", &result);
    free(search);
  }

  return result;
}
355
/**
 * Extract the hash (fragment) of `url`, including its leading '#'.
 *
 * The fragment is whatever starts at the first '#' of the path, which is
 * where pathname and search end, so it is located directly instead of
 * being derived from their lengths (the search is absent for URLs that
 * have no query).
 *
 * Returns a string allocated by sscanf's `%m` conversion that the caller
 * must free(), or NULL when there is no path or no '#'.
 */
char *
url_get_hash (const char* url) {
  char *path = url_get_path(url);
  if (!path) return NULL;

  char *hash = NULL;
  const char *start = strchr(path, '#');
  if (start) sscanf(start, "%ms", &hash);

  free(path);

  return hash;
}
383
/**
 * Extract the port of `url`: the text after the first ':' of the
 * hostname.
 *
 * Returns a string allocated by sscanf's `%m` conversion that the caller
 * must free(), or NULL when there is no hostname, the hostname has no
 * ':', or nothing follows the ':'.
 */
char *
url_get_port (const char* url) {
  char *hostname = url_get_hostname(url);
  if (!hostname) return NULL;

  char *port = NULL;

  // only look past the host when a ':' is really there; stepping over the
  // terminator of a port-less hostname would read out of bounds
  const char *colon = strchr(hostname, ':');
  if (colon) sscanf(colon + 1, "%ms", &port);

  free(hostname);
  return port;
}
396
397void
398url_inspect (char* url) {
399 url_data_inspect(url_parse(url));
400}
401
402
403#define PRINT_MEMBER(member) do{ \
404 if(data->member) \
405 printf(" ." #member ": \"%s\"\n", data->member); \
406 else \
407 printf(" ." #member ": (NULL)\n"); \
408 }while(0)
409
410void
411url_data_inspect (const url_data_t* data) {
412 printf("#url =>\n");
413 PRINT_MEMBER(href);
414 PRINT_MEMBER(protocol);
415 PRINT_MEMBER(host);
416 PRINT_MEMBER(auth);
417 PRINT_MEMBER(hostname);
418 PRINT_MEMBER(pathname);
419 PRINT_MEMBER(search);
420 PRINT_MEMBER(path);
421 PRINT_MEMBER(hash);
422 PRINT_MEMBER(query);
423 PRINT_MEMBER(port);
424}
425
426void
427url_free (url_data_t *data) {
428 if (!data) return;
429 free(data->auth);
430 free(data->protocol);
431 free(data->hostname);
432 free(data->host);
433 free(data->pathname);
434 free(data->path);
435 free(data->hash);
436 free(data->port);
437 free(data->search);
438 free(data->query);
439 free(data);
440}
441