| 1 | #include <string.h> |
| 2 | #include "url.h" |
| 3 | |
| 4 | /** |
| 5 | * URI Schemes |
| 6 | * http://en.wikipedia.org/wiki/URI_scheme |
| 7 | */ |
| 8 | |
// Table of scheme names that `url_is_protocol()` accepts. Lookups are
// exact, case-sensitive string comparisons, so every entry has to be
// spelled exactly as it appears in a URL.
static const char *URL_SCHEMES[] = {
  // official IANA registered schemes
  "aaa", "aaas", "about", "acap", "acct", "adiumxtra", "afp", "afs", "aim", "apt", "attachment", "aw",
  "beshare", "bitcoin", "bolo", "callto", "cap", "chrome", "chrome-extension", "com-eventbrite-attendee",
  "cid", "coap", "coaps", "content", "crid", "cvs", "data", "dav", "dict", "dlna-playsingle", "dlna-playcontainer",
  "dns", "dtn", "dvb", "ed2k", "facetime", "fax", "feed", "file", "finger", "fish", "ftp", "geo", "gg", "git",
  "gizmoproject", "go", "gopher", "gtalk", "h323", "hcp", "http", "https", "iax", "icap", "icon", "im",
  "imap", "info", "ipn", "ipp", "irc", "irc6", "ircs", "iris", "iris.beep", "iris.xpc", "iris.xpcs", "iris.lws",
  "itms", "jabber", "jar", "jms", "keyparc", "lastfm", "ldap", "ldaps", "magnet", "mailserver", "mailto",
  "maps", "market", "message", "mid", "mms", "modem", "ms-help", "ms-settings-power", "msnim", "msrp",
  "msrps", "mtqp", "mumble", "mupdate", "mvn", "news", "nfs", "ni", "nih", "nntp", "notes", "oid",
  "opaquelocktoken", "pack", "palm", "paparazzi", "pkcs11", "platform", "pop", "pres", "prospero", "proxy",
  "psyc", "query", "reload", "res", "resource", "rmi", "rsync", "rtmp", "rtsp", "secondlife", "service", "session",
  "sftp", "sgn", "shttp", "sieve", "sip", "sips", "skype", "smb", "sms", "snews", "snmp", "soap.beep", "soap.beeps",
  "soldat", "spotify", "ssh", "steam", "svn", "tag", "teamspeak", "tel", "telnet", "tftp", "things", "thismessage",
  "tn3270", "tip", "tv", "udp", "unreal", "urn", "ut2004", "vemmi", "ventrilo", "videotex", "view-source", "wais", "webcal",
  "ws", "wss", "wtai", "wyciwyg", "xcon", "xcon-userid", "xfire", "xmlrpc.beep", "xmlrpc.beeps", "xmpp", "xri", "ymsgr",

  // unofficial schemes
  "javascript", "jdbc", "doi"
};
| 30 | |
#ifndef HAVE_STRDUP
// strdup() is not part of ISO C99/C11; it is specified by POSIX.1-2008.
// Only supply the fallback when the POSIX feature-test macro does not
// already promise a library implementation.
#if !defined(_POSIX_C_SOURCE) || _POSIX_C_SOURCE < 200809L
/**
 * Fallback `strdup()`: returns a newly `malloc()`ed copy of the
 * NUL-terminated string `str`, or NULL when the allocation fails.
 * The caller owns the result and releases it with `free()`.
 */
char *
strdup (const char *str) {
  const size_t n = strlen(str) + 1; // include the terminating NUL
  char *dup = (char *) malloc(n);
  if (dup) memcpy(dup, str, n);
  return dup;
}
#endif
#endif
| 43 | |
| 44 | |
/**
 * "Fast-forwards" `ptr` by `n` bytes and returns a newly allocated copy
 * of the NUL-terminated string found at that position.
 *
 * The offset is applied blindly (it is not limited by any NUL byte that
 * precedes it), so the caller must guarantee that `ptr + n` still lies
 * inside the buffer and that a terminator follows.
 *
 * Returns NULL when `ptr` is NULL or the allocation fails. The caller
 * owns the result and releases it with `free()`.
 */
static char *
strff (const char* ptr, size_t n) {
  if (!ptr) return NULL;

  const char *start = ptr + n;
  const size_t size = strlen(start) + 1; // include the terminating NUL

  char *copy = (char *) malloc(size);
  if (copy) memcpy(copy, start, size);
  return copy;
}
| 53 | |
| 54 | |
/**
 * Extracts one component of `url`.
 *
 * The first `l` characters of `url` are skipped (an offset past the end
 * is limited to the end of the string) and the remainder is scanned with
 * the `sscanf()` pattern `format`, which must contain exactly one string
 * conversion (`%s` or `%[...]`).
 *
 * Returns a newly allocated string owned by the caller:
 *   - the scanned text, or
 *   - an empty string when `format` does not match at that position.
 * Returns NULL when an argument is invalid (NULL pointer, negative `l`),
 * when the allocation fails, or when the scanned text is identical to
 * the whole `url`.
 */
static char *
get_part (const char* url, const char *format, int l) {
  if (!url || !format || l < 0) return NULL;

  const size_t url_len = strlen(url);
  size_t offset = (size_t) l;
  if (offset > url_len) offset = url_len; // never scan past the terminator

  // The scanned text is a piece of `url`, so `url_len + 1` bytes always
  // suffice for the unbounded `%s` / `%[` conversion below.
  char *part = (char *) malloc(url_len + 1);
  if (!part) return NULL;
  part[0] = '\0'; // stays empty when the pattern does not match

  (void) sscanf(url + offset, format, part);

  if (0 == strcmp(part, url)) {
    free(part);
    return NULL;
  }

  return part;
}
| 90 | |
| 91 | url_data_t* |
| 92 | url_parse (char* url) { |
| 93 | url_data_t *data = (url_data_t *) calloc(1, sizeof(url_data_t)); |
| 94 | if (!data) return NULL; |
| 95 | |
| 96 | data->href = url; |
| 97 | char *tmp_url = strdup(url); |
| 98 | |
| 99 | char *protocol = url_get_protocol(tmp_url); |
| 100 | if (!protocol) { |
| 101 | free(tmp_url); |
| 102 | free(data); |
| 103 | return NULL; |
| 104 | } |
| 105 | // length of protocol plus :// |
| 106 | const size_t protocol_len = strlen(protocol) + 3; |
| 107 | data->protocol = protocol; |
| 108 | |
| 109 | const bool is_ssh = url_is_ssh(protocol); |
| 110 | |
| 111 | size_t auth_len = 0; |
| 112 | if (strstr(tmp_url, "@" )) { |
| 113 | data->auth = get_part(tmp_url, "%[^@]" , protocol_len); |
| 114 | auth_len = strlen(data->auth); |
| 115 | if (data->auth) auth_len++; |
| 116 | } |
| 117 | |
| 118 | char *hostname = (is_ssh) |
| 119 | ? get_part(tmp_url, "%[^:]" , protocol_len + auth_len) |
| 120 | : get_part(tmp_url, "%[^/]" , protocol_len + auth_len); |
| 121 | |
| 122 | if (!hostname) { |
| 123 | free(tmp_url); |
| 124 | url_free(data); |
| 125 | return NULL; |
| 126 | } |
| 127 | const size_t hostname_len = strlen(hostname); |
| 128 | char *tmp_hostname = strdup(hostname); |
| 129 | data->hostname = hostname; |
| 130 | |
| 131 | char *host = (char *) malloc((strlen(tmp_hostname)+1)); |
| 132 | sscanf(tmp_hostname, "%[^:]" , host); |
| 133 | free(tmp_hostname); |
| 134 | if (!host) { |
| 135 | free(tmp_url); |
| 136 | url_free(data); |
| 137 | return NULL; |
| 138 | } |
| 139 | data->host = host; |
| 140 | |
| 141 | const size_t host_len = strlen(host); |
| 142 | if (hostname_len > host_len) { |
| 143 | data->port = strff(hostname, host_len + 1); // +1 for ':' char; |
| 144 | } else { |
| 145 | data->port = NULL; |
| 146 | } |
| 147 | |
| 148 | char *tmp_path = (is_ssh) |
| 149 | ? get_part(tmp_url, ":%s" , protocol_len + auth_len + hostname_len) |
| 150 | : get_part(tmp_url, "/%s" , protocol_len + auth_len + hostname_len); |
| 151 | |
| 152 | char *path = (char *) malloc((strlen(tmp_path) + 2)); |
| 153 | if (!path) { |
| 154 | free(tmp_url); |
| 155 | url_free(data); |
| 156 | return NULL; |
| 157 | } |
| 158 | const char *fmt = (is_ssh)? "%s" : "/%s" ; |
| 159 | sprintf(path, fmt, tmp_path); |
| 160 | data->path = path; |
| 161 | |
| 162 | char *pathname = (char *) malloc((strlen(tmp_path) + 2)); |
| 163 | free(tmp_path); |
| 164 | if (!pathname) { |
| 165 | free(tmp_url); |
| 166 | url_free(data); |
| 167 | return NULL; |
| 168 | } |
| 169 | strcat(pathname, "" ); |
| 170 | tmp_path = strdup(path); |
| 171 | sscanf(tmp_path, "%[^? | ^#]" , pathname); |
| 172 | const size_t pathname_len = strlen(pathname); |
| 173 | data->pathname = pathname; |
| 174 | |
| 175 | char* tmp_path_new = strff(tmp_path, pathname_len); |
| 176 | free(tmp_path); |
| 177 | tmp_path = tmp_path_new; |
| 178 | char* search = NULL; |
| 179 | sscanf(tmp_path, "%m[^#]" , &search); |
| 180 | data->search = search; |
| 181 | |
| 182 | const size_t search_len = search ? strlen(search) : 0; |
| 183 | free(tmp_path); |
| 184 | |
| 185 | if(search) { |
| 186 | char* query = NULL; |
| 187 | sscanf(search, "?%ms" , &query); |
| 188 | data->query = query; |
| 189 | } |
| 190 | |
| 191 | char* hash = NULL; |
| 192 | tmp_path = strff(path, pathname_len + search_len); |
| 193 | sscanf(tmp_path, "%ms" , &hash); |
| 194 | data->hash = hash; |
| 195 | free(tmp_path); |
| 196 | free(tmp_url); |
| 197 | |
| 198 | return data; |
| 199 | } |
| 200 | |
| 201 | bool |
| 202 | url_is_protocol (const char* str) { |
| 203 | const unsigned count = sizeof(URL_SCHEMES) / sizeof(URL_SCHEMES[0]); |
| 204 | |
| 205 | for (unsigned i = 0; i < count; ++i) { |
| 206 | if (0 == strcmp(URL_SCHEMES[i], str)) { |
| 207 | return true; |
| 208 | } |
| 209 | } |
| 210 | |
| 211 | return false; |
| 212 | } |
| 213 | |
/**
 * Tells whether the scheme name `str` is "ssh" or "git", the two schemes
 * this parser treats as ssh-style.
 */
bool
url_is_ssh (const char* str) {
  return strcmp(str, "ssh") == 0 || strcmp(str, "git") == 0;
}
| 221 | |
| 222 | char * |
| 223 | url_get_protocol (const char* url) { |
| 224 | char *protocol = (char *) malloc(URL_PROTOCOL_MAX_LENGTH); |
| 225 | if (!protocol) return NULL; |
| 226 | |
| 227 | sscanf(url, "%[^://]" , protocol); |
| 228 | if (url_is_protocol(protocol)) return protocol; |
| 229 | |
| 230 | free(protocol); |
| 231 | return NULL; |
| 232 | } |
| 233 | |
| 234 | |
/**
 * Returns the credentials of `url`: the text between "://" and the '@'
 * that precedes the host, as a newly allocated string the caller frees.
 *
 * An '@' counts as the credentials separator only when it appears
 * before the first '/', '?' or '#' that follows "://".
 *
 * Returns NULL when `url` is NULL, has no known scheme followed by
 * "://", contains no such '@', or when an allocation fails.
 */
char *
url_get_auth (const char* url) {
  if (!url) return NULL;

  char *protocol = url_get_protocol(url);
  if (!protocol) return NULL;
  const size_t scheme_len = strlen(protocol);
  free(protocol);

  // the offset below is only meaningful when "://" follows the scheme
  if (0 != strncmp(url + scheme_len, "://", 3)) return NULL;
  const size_t l = scheme_len + 3;

  const char *authority = url + l;
  if (!memchr(authority, '@', strcspn(authority, "/?#"))) return NULL;

  return get_part(url, "%[^@]", (int) l);
}
| 243 | |
/**
 * Returns the hostname of `url` (including a ":port" suffix for
 * non-ssh schemes) as a newly allocated string the caller frees.
 *
 * For schemes reported by `url_is_ssh()` the hostname ends at the first
 * ':'; otherwise it ends at the first '/'.
 *
 * Returns NULL when `url` is NULL, has no known scheme followed by
 * "://", or when an allocation fails.
 */
char *
url_get_hostname (const char* url) {
  if (!url) return NULL;

  char *protocol = url_get_protocol(url);
  if (!protocol) return NULL;

  const bool is_ssh = url_is_ssh(protocol);
  size_t l = strlen(protocol);
  free(protocol);

  // the offsets below are only meaningful when "://" follows the scheme
  if (0 != strncmp(url + l, "://", 3)) return NULL;
  l += 3;

  char *auth = url_get_auth(url);
  if (auth) {
    const size_t auth_len = strlen(auth);
    // skip the credentials only when an '@' really terminates them
    if (l + auth_len < strlen(url) && '@' == url[l + auth_len]) {
      l += auth_len + 1; // add one @ symbol
    }
    free(auth);
  }

  return get_part(url, is_ssh ? "%[^:]" : "%[^/]", (int) l);
}
| 267 | |
/**
 * Returns the host of `url`: the part of `url_get_hostname()` in front
 * of the first ':'. The string is allocated by `sscanf()`'s `%m`
 * conversion and must be released by the caller with `free()`.
 *
 * Returns NULL when the hostname is unavailable or the scan matches
 * nothing.
 */
char *
url_get_host (const char* url) {
  char *full = url_get_hostname(url);
  if (NULL == full) return NULL;

  char *result = NULL;
  sscanf(full, "%m[^:]", &result);
  free(full);

  return result;
}
| 281 | |
/**
 * Returns the pathname of `url`: the result of `url_get_path()` up to,
 * but not including, the first '?' or '#'. This is the same boundary
 * `url_parse()` uses for its `pathname` member.
 *
 * The string is allocated by `sscanf()`'s `%m` conversion and must be
 * released by the caller with `free()`. Returns NULL when the path is
 * unavailable or the scan matches nothing.
 */
char *
url_get_pathname (const char* url) {
  char *path = url_get_path(url);
  char *pathname = NULL;

  if (!path) return NULL;

  // stop at the query ('?') as well as at the fragment ('#')
  sscanf(path, "%m[^?#]", &pathname);

  free(path);

  return pathname;
}
| 295 | |
/**
 * Returns the path of `url` (pathname plus any query and fragment) as a
 * newly allocated string the caller frees.
 *
 * For schemes reported by `url_is_ssh()` the path is the text after the
 * ':' that follows the hostname; otherwise it is the text after the
 * hostname with a leading '/' (just "/" when nothing follows).
 *
 * Returns NULL when `url` is NULL, has no known scheme followed by
 * "://", when the hostname is unavailable, or when an allocation fails.
 */
char *
url_get_path (const char* url) {
  if (!url) return NULL;

  char *protocol = url_get_protocol(url);
  if (!protocol) return NULL;

  const bool is_ssh = url_is_ssh(protocol);
  size_t l = strlen(protocol);
  free(protocol);

  // the offsets below are only meaningful when "://" follows the scheme
  if (0 != strncmp(url + l, "://", 3)) return NULL;
  l += 3;

  const size_t url_len = strlen(url);

  char *auth = url_get_auth(url);
  if (auth) {
    const size_t auth_len = strlen(auth);
    // skip the credentials only when an '@' really terminates them
    if (l + auth_len < url_len && '@' == url[l + auth_len]) {
      l += auth_len + 1; // @ symbol
    }
    free(auth);
  }

  char *hostname = url_get_hostname(url);
  if (!hostname) return NULL;
  l += strlen(hostname);
  free(hostname);

  if (l > url_len) return NULL; // offset does not describe this URL

  char *tmp_path = get_part(url, is_ssh ? ":%s" : "/%s", (int) l);
  if (!tmp_path) return NULL;

  // +2: optional leading '/' and the terminating NUL
  const size_t size = strlen(tmp_path) + 2;
  char *path = (char *) malloc(size);
  if (path) snprintf(path, size, is_ssh ? "%s" : "/%s", tmp_path);

  free(tmp_path);

  return path;
}
| 328 | |
/**
 * Returns the search part of `url`: the "?..." portion of
 * `url_get_path()` up to, but not including, the fragment ('#').
 *
 * The string is allocated by `sscanf()`'s `%m` conversion and must be
 * released by the caller with `free()`. Returns NULL when the path is
 * unavailable or when there is no query.
 */
char *
url_get_search (const char* url) {
  char *path = url_get_path(url);
  if (!path) return NULL;

  // The pathname ends at the first '?' or '#'. When it ends at '#' the
  // scan below matches nothing and `search` stays NULL.
  char *search = NULL;
  sscanf(path + strcspn(path, "?#"), "%m[^#]", &search);

  free(path);

  return search;
}
| 344 | |
/**
 * Returns the query of `url`: the result of `url_get_search()` without
 * its leading '?'. The string is allocated by `sscanf()`'s `%m`
 * conversion and must be released by the caller with `free()`.
 *
 * Returns NULL when there is no search part or nothing follows the '?'.
 */
char *
url_get_query (const char* url) {
  char *with_mark = url_get_search(url);
  if (NULL == with_mark) return NULL;

  char *result = NULL;
  sscanf(with_mark, "?%ms", &result);

  free(with_mark);
  return result;
}
| 355 | |
/**
 * Returns the fragment of `url`, including its leading '#': everything
 * in `url_get_path()` from the first '#' onwards.
 *
 * The string is allocated by `sscanf()`'s `%m` conversion and must be
 * released by the caller with `free()`. Returns NULL when the path is
 * unavailable or contains no '#'.
 */
char *
url_get_hash (const char* url) {
  char *path = url_get_path(url);
  if (!path) return NULL;

  // Pathname and search both end at the first '#', so the fragment
  // starts there whether or not the URL has a query.
  char *hash = NULL;
  const char *start = strchr(path, '#');
  if (start) sscanf(start, "%ms", &hash);

  free(path);

  return hash;
}
| 383 | |
/**
 * Returns the port of `url`: the text after the first ':' in
 * `url_get_hostname()`.
 *
 * The string is allocated by `sscanf()`'s `%m` conversion and must be
 * released by the caller with `free()`. Returns NULL when the hostname
 * is unavailable, contains no ':', or nothing follows the ':'.
 */
char *
url_get_port (const char* url) {
  char *hostname = url_get_hostname(url);
  if (!hostname) return NULL;

  // Only look behind a ':' that is really there; without one there is
  // no port and nothing past the terminator may be read.
  char *port = NULL;
  const char *colon = strchr(hostname, ':');
  if (colon) sscanf(colon + 1, "%ms", &port);

  free(hostname);
  return port;
}
| 396 | |
| 397 | void |
| 398 | url_inspect (char* url) { |
| 399 | url_data_inspect(url_parse(url)); |
| 400 | } |
| 401 | |
| 402 | |
| 403 | #define PRINT_MEMBER(member) do{ \ |
| 404 | if(data->member) \ |
| 405 | printf(" ." #member ": \"%s\"\n", data->member); \ |
| 406 | else \ |
| 407 | printf(" ." #member ": (NULL)\n"); \ |
| 408 | }while(0) |
| 409 | |
| 410 | void |
| 411 | url_data_inspect (const url_data_t* data) { |
| 412 | printf("#url =>\n" ); |
| 413 | PRINT_MEMBER(href); |
| 414 | PRINT_MEMBER(protocol); |
| 415 | PRINT_MEMBER(host); |
| 416 | PRINT_MEMBER(auth); |
| 417 | PRINT_MEMBER(hostname); |
| 418 | PRINT_MEMBER(pathname); |
| 419 | PRINT_MEMBER(search); |
| 420 | PRINT_MEMBER(path); |
| 421 | PRINT_MEMBER(hash); |
| 422 | PRINT_MEMBER(query); |
| 423 | PRINT_MEMBER(port); |
| 424 | } |
| 425 | |
| 426 | void |
| 427 | url_free (url_data_t *data) { |
| 428 | if (!data) return; |
| 429 | free(data->auth); |
| 430 | free(data->protocol); |
| 431 | free(data->hostname); |
| 432 | free(data->host); |
| 433 | free(data->pathname); |
| 434 | free(data->path); |
| 435 | free(data->hash); |
| 436 | free(data->port); |
| 437 | free(data->search); |
| 438 | free(data->query); |
| 439 | free(data); |
| 440 | } |
| 441 | |