1 | #include <string.h> |
2 | #include "url.h" |
3 | |
4 | /** |
5 | * URI Schemes |
6 | * http://en.wikipedia.org/wiki/URI_scheme |
7 | */ |
8 | |
/*
 * URI scheme names recognised by url_is_protocol().
 *
 * Lookups use an exact strcmp(), so every entry must be spelled precisely
 * as it appears in a URL: lowercase and without the trailing "://".
 */
static const char *URL_SCHEMES[] = {
  // schemes from the list referenced above
  // NOTE(review): several entries are not IANA-registered
  "aaa", "aaas", "about", "acap", "acct", "adiumxtra", "afp", "afs", "aim", "apt", "attachment", "aw",
  "beshare", "bitcoin", "bolo", "callto", "cap", "chrome", "chrome-extension", "com-eventbrite-attendee",
  "cid", "coap", "coaps", "content", "crid", "cvs", "data", "dav", "dict", "dlna-playsingle", "dlna-playcontainer",
  "dns", "dtn", "dvb", "ed2k", "facetime", "fax", "feed", "file", "finger", "fish", "ftp", "geo", "gg", "git",
  "gizmoproject", "go", "gopher", "gtalk", "h323", "hcp", "http", "https", "iax", "icap", "icon", "im",
  "imap", "info", "ipn", "ipp", "irc", "irc6", "ircs", "iris", "iris.beep", "iris.xpc", "iris.xpcs", "iris.lwz",
  "itms", "jabber", "jar", "jms", "keyparc", "lastfm", "ldap", "ldaps", "magnet", "mailserver", "mailto",
  "maps", "market", "message", "mid", "mms", "modem", "ms-help", "ms-settings-power", "msnim", "msrp",
  "msrps", "mtqp", "mumble", "mupdate", "mvn", "news", "nfs", "ni", "nih", "nntp", "notes", "oid",
  "opaquelocktoken", "pack", "palm", "paparazzi", "pkcs11", "platform", "pop", "pres", "prospero", "proxy",
  "psyc", "query", "reload", "res", "resource", "rmi", "rsync", "rtmp", "rtsp", "secondlife", "service", "session",
  "sftp", "sgn", "shttp", "sieve", "sip", "sips", "skype", "smb", "sms", "snews", "snmp", "soap.beep", "soap.beeps",
  "soldat", "spotify", "ssh", "steam", "svn", "tag", "teamspeak", "tel", "telnet", "tftp", "things", "thismessage",
  "tn3270", "tip", "tv", "udp", "unreal", "urn", "ut2004", "vemmi", "ventrilo", "videotex", "view-source", "wais", "webcal",
  "ws", "wss", "wtai", "wyciwyg", "xcon", "xcon-userid", "xfire", "xmlrpc.beep", "xmlrpc.beeps", "xmpp", "xri", "ymsgr",

  // unofficial schemes
  "javascript", "jdbc", "doi"
};
30 | |
#ifndef HAVE_STRDUP
// strdup() is not part of ISO C99; provide a fallback unless the POSIX
// feature-test macro says POSIX.1-2008 (which includes it) is in effect.
// `- 0` keeps the test well-formed if the macro is defined but empty.
#if !defined(_POSIX_C_SOURCE) || (_POSIX_C_SOURCE - 0) < 200809L
/**
 * Duplicate the NUL-terminated string `str` into malloc()ed storage.
 *
 * @return the copy, which the caller must free(), or NULL when the
 *         allocation fails.
 */
char *
strdup (const char *str) {
  const size_t n = strlen(str) + 1; // include the terminator
  char *dup = malloc(n);
  if (dup) memcpy(dup, str, n);
  return dup;
}
#endif
#endif
43 | |
44 | |
/**
 * "Fast-forward" helper: duplicate the string that starts `n` characters
 * past `ptr`.
 *
 * The skip is unconditional. The first `n` bytes are never inspected, not
 * even for a NUL terminator, so the caller must guarantee that `ptr + n`
 * still points into the same buffer.
 *
 * @return a strdup()ed copy the caller must free(), or NULL if strdup()
 *         fails.
 */
static char *
strff (const char* ptr, size_t n) {
  const char *tail = ptr + n;
  return strdup(tail);
}
53 | |
54 | |
/**
 * Scan one component out of `url`, starting `l` characters from its
 * beginning.
 *
 * @param url    NUL-terminated URL to read from; it is not modified.
 * @param format sscanf() format with a single string conversion that
 *               stores into a plain char buffer (e.g. "%[^@]" or "/%s").
 * @param l      offset of the first character to scan; for the first
 *               component this is the scheme length plus 3 for "://".
 *
 * @return a malloc()ed string owned by the caller. It is empty when the
 *         format matched nothing at that offset. NULL is returned when an
 *         argument is NULL, `l` is negative or lies beyond the end of
 *         `url`, the capture equals the whole `url`, or allocation fails.
 */
static char *
get_part (const char* url, const char *format, int l) {
  if (!url || !format || l < 0) return NULL;

  const size_t url_len = strlen(url);
  // an offset past the terminator would make sscanf() read foreign memory
  if ((size_t) l > url_len) return NULL;

  // the conversions used in this file capture a substring of `url`,
  // so url_len + 1 bytes are always enough
  char *part = malloc(url_len + 1);
  if (!part) return NULL;

  // pre-terminate: sscanf() leaves the buffer untouched when nothing matches
  part[0] = '\0';
  sscanf(url + l, format, part);

  // a capture identical to the entire url means "no separate part"
  if (0 == strcmp(part, url)) {
    free(part);
    return NULL;
  }

  return part;
}
90 | |
91 | url_data_t* |
92 | url_parse (char* url) { |
93 | url_data_t *data = (url_data_t *) calloc(1, sizeof(url_data_t)); |
94 | if (!data) return NULL; |
95 | |
96 | data->href = url; |
97 | char *tmp_url = strdup(url); |
98 | |
99 | char *protocol = url_get_protocol(tmp_url); |
100 | if (!protocol) { |
101 | free(tmp_url); |
102 | free(data); |
103 | return NULL; |
104 | } |
105 | // length of protocol plus :// |
106 | const size_t protocol_len = strlen(protocol) + 3; |
107 | data->protocol = protocol; |
108 | |
109 | const bool is_ssh = url_is_ssh(protocol); |
110 | |
111 | size_t auth_len = 0; |
112 | if (strstr(tmp_url, "@" )) { |
113 | data->auth = get_part(tmp_url, "%[^@]" , protocol_len); |
114 | auth_len = strlen(data->auth); |
115 | if (data->auth) auth_len++; |
116 | } |
117 | |
118 | char *hostname = (is_ssh) |
119 | ? get_part(tmp_url, "%[^:]" , protocol_len + auth_len) |
120 | : get_part(tmp_url, "%[^/]" , protocol_len + auth_len); |
121 | |
122 | if (!hostname) { |
123 | free(tmp_url); |
124 | url_free(data); |
125 | return NULL; |
126 | } |
127 | const size_t hostname_len = strlen(hostname); |
128 | char *tmp_hostname = strdup(hostname); |
129 | data->hostname = hostname; |
130 | |
131 | char *host = (char *) malloc((strlen(tmp_hostname)+1)); |
132 | sscanf(tmp_hostname, "%[^:]" , host); |
133 | free(tmp_hostname); |
134 | if (!host) { |
135 | free(tmp_url); |
136 | url_free(data); |
137 | return NULL; |
138 | } |
139 | data->host = host; |
140 | |
141 | const size_t host_len = strlen(host); |
142 | if (hostname_len > host_len) { |
143 | data->port = strff(hostname, host_len + 1); // +1 for ':' char; |
144 | } else { |
145 | data->port = NULL; |
146 | } |
147 | |
148 | char *tmp_path = (is_ssh) |
149 | ? get_part(tmp_url, ":%s" , protocol_len + auth_len + hostname_len) |
150 | : get_part(tmp_url, "/%s" , protocol_len + auth_len + hostname_len); |
151 | |
152 | char *path = (char *) malloc((strlen(tmp_path) + 2)); |
153 | if (!path) { |
154 | free(tmp_url); |
155 | url_free(data); |
156 | return NULL; |
157 | } |
158 | const char *fmt = (is_ssh)? "%s" : "/%s" ; |
159 | sprintf(path, fmt, tmp_path); |
160 | data->path = path; |
161 | |
162 | char *pathname = (char *) malloc((strlen(tmp_path) + 2)); |
163 | free(tmp_path); |
164 | if (!pathname) { |
165 | free(tmp_url); |
166 | url_free(data); |
167 | return NULL; |
168 | } |
169 | strcat(pathname, "" ); |
170 | tmp_path = strdup(path); |
171 | sscanf(tmp_path, "%[^? | ^#]" , pathname); |
172 | const size_t pathname_len = strlen(pathname); |
173 | data->pathname = pathname; |
174 | |
175 | char* tmp_path_new = strff(tmp_path, pathname_len); |
176 | free(tmp_path); |
177 | tmp_path = tmp_path_new; |
178 | char* search = NULL; |
179 | sscanf(tmp_path, "%m[^#]" , &search); |
180 | data->search = search; |
181 | |
182 | const size_t search_len = search ? strlen(search) : 0; |
183 | free(tmp_path); |
184 | |
185 | if(search) { |
186 | char* query = NULL; |
187 | sscanf(search, "?%ms" , &query); |
188 | data->query = query; |
189 | } |
190 | |
191 | char* hash = NULL; |
192 | tmp_path = strff(path, pathname_len + search_len); |
193 | sscanf(tmp_path, "%ms" , &hash); |
194 | data->hash = hash; |
195 | free(tmp_path); |
196 | free(tmp_url); |
197 | |
198 | return data; |
199 | } |
200 | |
201 | bool |
202 | url_is_protocol (const char* str) { |
203 | const unsigned count = sizeof(URL_SCHEMES) / sizeof(URL_SCHEMES[0]); |
204 | |
205 | for (unsigned i = 0; i < count; ++i) { |
206 | if (0 == strcmp(URL_SCHEMES[i], str)) { |
207 | return true; |
208 | } |
209 | } |
210 | |
211 | return false; |
212 | } |
213 | |
/**
 * Report whether `str` names a scheme parsed in ssh style, i.e. exactly
 * "ssh" or "git".
 *
 * @param str NUL-terminated scheme name, without "://".
 */
bool
url_is_ssh (const char* str) {
  return strcmp(str, "ssh") == 0 || strcmp(str, "git") == 0;
}
221 | |
/**
 * Extract the scheme of `url`: the text before the first ':' or '/'.
 *
 * The buffer is sized from the actual scheme length, so an over-long
 * prefix cannot overflow it.
 *
 * @return a malloc()ed copy of the scheme the caller must free(), or NULL
 *         when `url` is NULL, starts with ':' or '/' (or is empty), the
 *         prefix is not accepted by url_is_protocol(), or allocation fails.
 */
char *
url_get_protocol (const char* url) {
  if (!url) return NULL;

  const size_t len = strcspn(url, ":/");
  if (0 == len) return NULL;

  char *protocol = malloc(len + 1);
  if (!protocol) return NULL;

  memcpy(protocol, url, len);
  protocol[len] = '\0';

  if (url_is_protocol(protocol)) return protocol;

  free(protocol);
  return NULL;
}
233 | |
234 | |
/**
 * Extract the auth part of `url`: the text between "://" and the first '@'.
 *
 * As in url_parse(), a URL without any '@' has no auth part.
 *
 * @return a malloc()ed string the caller must free(), or NULL when `url`
 *         is NULL, contains no '@', lacks a recognised "scheme://" prefix,
 *         or allocation fails.
 */
char *
url_get_auth (const char* url) {
  if (!url) return NULL;

  // without this guard the scan would return host and path text instead
  if (!strchr(url, '@')) return NULL;

  char *protocol = url_get_protocol(url);
  if (!protocol) return NULL;
  const size_t scheme_len = strlen(protocol);
  free(protocol);

  // the offset below is only valid when "://" really follows the scheme
  if (0 != strncmp(url + scheme_len, "://", 3)) return NULL;

  const size_t l = scheme_len + 3;
  return get_part(url, "%[^@]", (int) l);
}
243 | |
/**
 * Extract the hostname of `url`, including any ":port" suffix.
 *
 * The hostname starts after "scheme://" and after "auth@" when the URL
 * contains an '@'. It ends at the first ':' for schemes where url_is_ssh()
 * is true, and at the first '/' otherwise.
 *
 * @return a malloc()ed string the caller must free(), or NULL when `url`
 *         is NULL, lacks a recognised "scheme://" prefix, or allocation
 *         fails.
 */
char *
url_get_hostname (const char* url) {
  if (!url) return NULL;

  char *protocol = url_get_protocol(url);
  if (!protocol) return NULL;

  // take everything needed from the scheme before releasing it
  const size_t scheme_len = strlen(protocol);
  const bool is_ssh = url_is_ssh(protocol);
  free(protocol);

  if (0 != strncmp(url + scheme_len, "://", 3)) return NULL;
  size_t l = scheme_len + 3;

  // only skip an auth part when there is an '@' to delimit it
  if (strchr(url, '@')) {
    char *auth = url_get_auth(url);
    if (!auth) return NULL;
    l += strlen(auth) + 1; // add one @ symbol
    free(auth);
  }

  // never hand get_part() an offset beyond the terminator
  if (l > strlen(url)) return NULL;

  return get_part(url, is_ssh ? "%[^:]" : "%[^/]", (int) l);
}
267 | |
/**
 * Extract the host of `url`: the hostname up to, but excluding, the first
 * ':'.
 *
 * Implemented without the scanf "%m" modifier, which is not available in
 * every C library.
 *
 * @return a malloc()ed string the caller must free(), or NULL when
 *         url_get_hostname() fails or the host would be empty.
 */
char *
url_get_host (const char* url) {
  char *hostname = url_get_hostname(url);
  if (!hostname) return NULL;

  const size_t host_len = strcspn(hostname, ":");
  if (0 == host_len) {
    free(hostname);
    return NULL;
  }

  // cut the ":port" suffix off in place and hand the buffer to the caller
  hostname[host_len] = '\0';
  return hostname;
}
281 | |
/**
 * Extract the pathname of `url`: the path up to the first '?' or '#'.
 *
 * Stopping at '#' as well as '?' keeps the result in line with the
 * pathname computed by url_parse(). Implemented without the scanf "%m"
 * modifier, which is not available in every C library.
 *
 * @return a malloc()ed string the caller must free(); it is empty when the
 *         path itself is empty or begins with '?' or '#'. NULL is returned
 *         when url_get_path() fails.
 */
char *
url_get_pathname (const char* url) {
  char *path = url_get_path(url);
  if (!path) return NULL;

  // truncate in place and hand the buffer to the caller
  path[strcspn(path, "?#")] = '\0';
  return path;
}
295 | |
/**
 * Extract the path of `url`: everything after the hostname, i.e. pathname,
 * search and hash together.
 *
 * For schemes where url_is_ssh() is true the path is the text after the
 * ':' that ends the hostname. Otherwise it is the text after the '/' that
 * ends the hostname, with a leading '/' restored.
 *
 * @return a malloc()ed string the caller must free(), or NULL when `url`
 *         is NULL, lacks a recognised "scheme://" prefix, the hostname
 *         cannot be determined, or allocation fails.
 */
char *
url_get_path (const char* url) {
  if (!url) return NULL;

  char *path = NULL;
  char *tmp_path = NULL;
  char *auth = NULL;
  char *hostname = NULL;

  char *protocol = url_get_protocol(url);
  if (!protocol) goto done;

  const size_t scheme_len = strlen(protocol);
  if (0 != strncmp(url + scheme_len, "://", 3)) goto done;

  hostname = url_get_hostname(url);
  if (!hostname) goto done;

  size_t l = scheme_len + 3 + strlen(hostname);

  // only account for an auth part when there is an '@' to delimit it
  if (strchr(url, '@')) {
    auth = url_get_auth(url);
    if (!auth) goto done;
    l += strlen(auth) + 1; // @ symbol
  }

  // never hand get_part() an offset beyond the terminator
  if (l > strlen(url)) goto done;

  const bool is_ssh = url_is_ssh(protocol);
  tmp_path = get_part(url, is_ssh ? ":%s" : "/%s", (int) l);
  if (!tmp_path) goto done;

  const size_t path_size = strlen(tmp_path) + 2; // optional '/' + NUL
  path = malloc(path_size);
  if (path) snprintf(path, path_size, "%s%s", is_ssh ? "" : "/", tmp_path);

done:
  // single exit so every temporary is released on every route
  free(tmp_path);
  free(auth);
  free(hostname);
  free(protocol);
  return path;
}
328 | |
/**
 * Extract the search string of `url`: the part of the path that starts at
 * '?' and runs up to, but excluding, a '#'.
 *
 * A '?' that appears only after a '#' belongs to the hash and does not
 * start a search string, matching url_parse(). Implemented without the
 * scanf "%m" modifier, which is not available in every C library.
 *
 * @return a malloc()ed string beginning with '?' that the caller must
 *         free(), or NULL when the URL has no search string,
 *         url_get_path() fails, or allocation fails.
 */
char *
url_get_search (const char* url) {
  char *path = url_get_path(url);
  if (!path) return NULL;

  char *search = NULL;

  // skip the pathname; the search string must start right after it
  const char *rest = path + strcspn(path, "?#");
  if ('?' == *rest) {
    const size_t len = strcspn(rest, "#");
    search = malloc(len + 1);
    if (search) {
      memcpy(search, rest, len);
      search[len] = '\0';
    }
  }

  free(path);
  return search;
}
344 | |
/**
 * Extract the query of `url`: the search string without its leading '?'.
 *
 * Implemented without the scanf "%m" modifier, which is not available in
 * every C library.
 *
 * @return a malloc()ed string the caller must free(), or NULL when
 *         url_get_search() returns NULL or nothing follows the '?'.
 */
char *
url_get_query (const char* url) {
  char *search = url_get_search(url);
  if (!search) return NULL;

  if ('?' != search[0] || '\0' == search[1]) {
    free(search);
    return NULL;
  }

  // shift left over the '?'; strlen(search) bytes covers the terminator
  memmove(search, search + 1, strlen(search));
  return search;
}
355 | |
/**
 * Extract the hash of `url`: the part of the path left after the pathname
 * and the optional search string, starting at '#'.
 *
 * A URL without a query string is handled; the offsets come straight from
 * the path, so no NULL search string is ever measured.
 *
 * @return a malloc()ed string beginning with '#' that the caller must
 *         free(), or NULL when the URL has no hash or url_get_path() fails.
 */
char *
url_get_hash (const char* url) {
  char *path = url_get_path(url);
  if (!path) return NULL;

  // skip the pathname, then the search string if one is present
  const char *rest = path + strcspn(path, "?#");
  if ('?' == *rest) rest += strcspn(rest, "#");

  if ('\0' == *rest) {
    free(path);
    return NULL;
  }

  // move the hash to the front of the buffer and hand it to the caller
  memmove(path, rest, strlen(rest) + 1);
  return path;
}
383 | |
/**
 * Extract the port of `url`: the text after the first ':' in the hostname.
 *
 * The ':' is located inside the hostname itself, so a hostname without a
 * port is never read past its terminator.
 *
 * @return a malloc()ed string the caller must free(), or NULL when
 *         url_get_hostname() fails, the hostname has no ':', or nothing
 *         follows the ':'.
 */
char *
url_get_port (const char* url) {
  char *hostname = url_get_hostname(url);
  if (!hostname) return NULL;

  const char *colon = strchr(hostname, ':');
  if (!colon || '\0' == colon[1]) {
    free(hostname);
    return NULL;
  }

  // move the port to the front of the buffer and hand it to the caller
  memmove(hostname, colon + 1, strlen(colon + 1) + 1);
  return hostname;
}
396 | |
397 | void |
398 | url_inspect (char* url) { |
399 | url_data_inspect(url_parse(url)); |
400 | } |
401 | |
402 | |
403 | #define PRINT_MEMBER(member) do{ \ |
404 | if(data->member) \ |
405 | printf(" ." #member ": \"%s\"\n", data->member); \ |
406 | else \ |
407 | printf(" ." #member ": (NULL)\n"); \ |
408 | }while(0) |
409 | |
410 | void |
411 | url_data_inspect (const url_data_t* data) { |
412 | printf("#url =>\n" ); |
413 | PRINT_MEMBER(href); |
414 | PRINT_MEMBER(protocol); |
415 | PRINT_MEMBER(host); |
416 | PRINT_MEMBER(auth); |
417 | PRINT_MEMBER(hostname); |
418 | PRINT_MEMBER(pathname); |
419 | PRINT_MEMBER(search); |
420 | PRINT_MEMBER(path); |
421 | PRINT_MEMBER(hash); |
422 | PRINT_MEMBER(query); |
423 | PRINT_MEMBER(port); |
424 | } |
425 | |
426 | void |
427 | url_free (url_data_t *data) { |
428 | if (!data) return; |
429 | free(data->auth); |
430 | free(data->protocol); |
431 | free(data->hostname); |
432 | free(data->host); |
433 | free(data->pathname); |
434 | free(data->path); |
435 | free(data->hash); |
436 | free(data->port); |
437 | free(data->search); |
438 | free(data->query); |
439 | free(data); |
440 | } |
441 | |