-
Notifications
You must be signed in to change notification settings - Fork 602
Separate path and query in URI API #2290
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -69,6 +69,19 @@ PathChars() | |
| return pathValid; | ||
| } | ||
|
|
||
| /// Characters which are valid within a URI query section | ||
| static const CharacterSet & | ||
| QueryChars() | ||
| { | ||
| /* | ||
| * RFC 3986 section 3.4 | ||
| * | ||
| * query = *( pchar / "/" / "?" ) | ||
| */ | ||
| static const auto queryValid = CharacterSet("query", "/?") + PathChars(); | ||
| return queryValid; | ||
| } | ||
|
|
||
| /** | ||
| * Governed by RFC 3986 section 2.1 | ||
| */ | ||
|
|
@@ -208,6 +221,17 @@ AnyP::Uri::path() const | |
| return path_; | ||
| } | ||
|
|
||
| const SBuf & | ||
| AnyP::Uri::query() const | ||
| { | ||
| // FTP does not have a query section | ||
| static SBuf empty; | ||
| if (scheme_ == AnyP::PROTO_FTP) | ||
| return empty; | ||
|
|
||
| return query_; | ||
| } | ||
|
|
||
| void | ||
| urlInitialize(void) | ||
| { | ||
|
|
@@ -310,6 +334,47 @@ urlAppendDomain(char *host) | |
| return true; | ||
| } | ||
|
|
||
| /// \returns whether whitespace has 'chopped' the URL apart. | ||
| static bool | ||
| StripAnyWsp(char *buf) | ||
| { | ||
| if (stringHasWhitespace(buf)) { | ||
| switch (Config.uri_whitespace) { | ||
|
|
||
| case URI_WHITESPACE_DENY: | ||
| throw TextException("URI has whitespace", Here()); | ||
|
|
||
| case URI_WHITESPACE_ALLOW: | ||
| break; | ||
|
|
||
| case URI_WHITESPACE_ENCODE: { | ||
| auto *t = rfc1738_escape_unescaped(buf); | ||
| xstrncpy(buf, t, MAX_URL); | ||
| } | ||
| break; | ||
|
|
||
| case URI_WHITESPACE_CHOP: | ||
| *(buf + strcspn(buf, w_space)) = '\0'; | ||
| return true; | ||
|
|
||
| case URI_WHITESPACE_STRIP: | ||
| default: { | ||
| char *t = buf; | ||
| char *q = buf; | ||
| while (*t) { | ||
| if (!xisspace(*t)) { | ||
| *q = *t; | ||
| ++q; | ||
| } | ||
| ++t; | ||
| } | ||
| *q = '\0'; | ||
| } | ||
| } | ||
| } | ||
| return false; | ||
| } | ||
|
|
||
| /* | ||
| * Parse a URI/URL. | ||
| * | ||
|
|
@@ -330,14 +395,14 @@ AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl) | |
| LOCAL_ARRAY(char, login, MAX_URL); | ||
| LOCAL_ARRAY(char, foundHost, MAX_URL); | ||
| LOCAL_ARRAY(char, urlpath, MAX_URL); | ||
| LOCAL_ARRAY(char, foundQuery, MAX_URL); | ||
| char *t = nullptr; | ||
| char *q = nullptr; | ||
| int foundPort; | ||
| int l; | ||
| int i; | ||
| const char *src; | ||
| char *dst; | ||
| foundHost[0] = urlpath[0] = login[0] = '\0'; | ||
| foundHost[0] = urlpath[0] = foundQuery[0] = login[0] = '\0'; | ||
|
|
||
| if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) { | ||
| debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)"); | ||
|
|
@@ -411,23 +476,58 @@ AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl) | |
| return false; | ||
| *dst = '\0'; | ||
|
|
||
| // We are looking at path-abempty. | ||
| if (*src != '/') { | ||
| // path-empty, including the end of the `src` c-string cases | ||
| urlpath[0] = '/'; | ||
| dst = &urlpath[1]; | ||
| } else { | ||
| bool chopped = false; | ||
| if (*src == '/') { | ||
| dst = urlpath; | ||
| } | ||
| /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */ | ||
| for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) { | ||
| *dst = *src; | ||
| /* Then everything from / (inclusive) until '?', '#', '\r\n' or '\0' - that's urlpath */ | ||
| for (; i < l && *src != '?' && *src != '#' && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) { | ||
| *dst = *src; | ||
| } | ||
| *dst = '\0'; | ||
| chopped = StripAnyWsp(urlpath); | ||
| rfc1738_unescape(urlpath); | ||
| } else { | ||
| // We should be looking at path-abempty. relative-path is not supported (yet). | ||
| urlpath[0] = '/'; | ||
| urlpath[1] = '\0'; | ||
| } | ||
|
|
||
| /* We -could- be at the end of the buffer here */ | ||
| if (i > l) | ||
| return false; | ||
| *dst = '\0'; | ||
|
|
||
| if (*src == '?') { | ||
| dst = foundQuery; | ||
| ++src; // skip '?' delimiter | ||
| ++i; // keep track of bytes 'consumed' from src | ||
| /* Then everything from '?' (exclusive) until '#', '\r\n' or '\0' - that's query */ | ||
| for (; i < l && *src != '#' && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) { | ||
| *dst = *src; | ||
| } | ||
| *dst = '\0'; | ||
| if (chopped) | ||
| *foundQuery = '\0'; | ||
| else | ||
| chopped = StripAnyWsp(foundQuery); | ||
| rfc1738_unescape(foundQuery); | ||
| } | ||
| if (i > l) | ||
| return false; | ||
|
|
||
| if (*src == '#') { | ||
| ++src; // skip '#' delimiter | ||
| ++i; // keep track of bytes 'consumed' from src | ||
| /* Then everything from '#' (exclusive) until '\r\n' or '\0' - that's fragment */ | ||
| for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src) { | ||
| ; // fragment component is not expected in network protocols. discard for now. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If official parse() code does not discard fragments, then this PR cannot use a "for now" excuse to start discarding them; most likely, this PR should not discard fragments at all, but we can discuss better excuses if really needed.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Official code calls it part of "path" component. Which is patently false. This code splits the components correctly per RFC3986. Keeping the components which are used by If you insist on retaining fragment support, I can easily add it back in. Though it would be pointless to store, since it was only actually used by |
||
| } | ||
| } | ||
|
|
||
| if (i > l) | ||
| return false; | ||
|
|
||
| if (*src != '\r' && *src != '\n' && *src != '\0') | ||
| throw TextException("invalid URL", Here()); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A parsing method that reports success/failure via its boolean return value should not throw on invalid input. The two result-reporting designs are mutually exclusive.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. um, I don't think that is correct use of the term "mutually exclusive". This method is in transition towards the state where parsing errors are thrown, and the boolean false means needs-more-data. The bits of logic where you do not see that existing are yet to be refactored away. |
||
|
|
||
| // If the parsed scheme has no (known) default port, and there is no | ||
| // explicit port, then we will reject the zero port during foundPort | ||
|
|
@@ -490,27 +590,20 @@ AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl) | |
| *t = '\0'; | ||
| ++t; | ||
| foundPort = atoi(t); | ||
| // BUG: fails to detect non-DIGIT garbage like "http://localhost:80fubar" | ||
| } | ||
| } | ||
|
|
||
| for (t = foundHost; *t; ++t) | ||
| *t = xtolower(*t); | ||
|
|
||
| if (stringHasWhitespace(foundHost)) { | ||
| if (URI_WHITESPACE_STRIP == Config.uri_whitespace) { | ||
| t = q = foundHost; | ||
| while (*t) { | ||
| if (!xisspace(*t)) { | ||
| *q = *t; | ||
| ++q; | ||
| } | ||
| ++t; | ||
| } | ||
| *q = '\0'; | ||
| } | ||
| if (StripAnyWsp(foundHost)) { | ||
| urlpath[0] = '/'; | ||
| urlpath[1] = '\0'; | ||
| *foundQuery = '\0'; | ||
| } | ||
|
|
||
| debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'"); | ||
| debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "', query='" << foundQuery << "'"); | ||
|
|
||
| if (Config.onoff.check_hostnames && | ||
| strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) { | ||
|
|
@@ -536,41 +629,8 @@ AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl) | |
| return false; | ||
| } | ||
|
|
||
| if (stringHasWhitespace(urlpath)) { | ||
| debugs(23, 2, "URI has whitespace: {" << rawUrl << "}"); | ||
|
|
||
| switch (Config.uri_whitespace) { | ||
|
|
||
| case URI_WHITESPACE_DENY: | ||
| return false; | ||
|
|
||
| case URI_WHITESPACE_ALLOW: | ||
| break; | ||
|
|
||
| case URI_WHITESPACE_ENCODE: | ||
| t = rfc1738_escape_unescaped(urlpath); | ||
| xstrncpy(urlpath, t, MAX_URL); | ||
| break; | ||
|
|
||
| case URI_WHITESPACE_CHOP: | ||
| *(urlpath + strcspn(urlpath, w_space)) = '\0'; | ||
| break; | ||
|
|
||
| case URI_WHITESPACE_STRIP: | ||
| default: | ||
| t = q = urlpath; | ||
| while (*t) { | ||
| if (!xisspace(*t)) { | ||
| *q = *t; | ||
| ++q; | ||
| } | ||
| ++t; | ||
| } | ||
| *q = '\0'; | ||
| } | ||
| } | ||
|
|
||
| setScheme(scheme); | ||
| query(SBuf(foundQuery)); | ||
| path(urlpath); | ||
| host(foundHost); | ||
| userInfo(SBuf(login)); | ||
|
|
@@ -775,8 +835,12 @@ SBuf & | |
| AnyP::Uri::absolutePath() const | ||
| { | ||
| if (absolutePath_.isEmpty()) { | ||
| // TODO: Encode each URI subcomponent in path_ as needed. | ||
| absolutePath_ = Encode(path(), PathChars()); | ||
| if (!path().isEmpty()) | ||
| absolutePath_ = Encode(path(), PathChars()); | ||
| if (!query().isEmpty()) { | ||
| absolutePath_.append("?"); | ||
| absolutePath_.append(Encode(query(), QueryChars())); | ||
| } | ||
|
Comment on lines
+838
to
+843
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FYI: This TODO implementation is the fix for bug 5523 titular issue for PR #2289. The rest of this PR is logic supporting this change. |
||
| } | ||
|
|
||
| return absolutePath_; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should not add serious features to this bad (on several levels) Uri::parse() code until we refactor it to address its known quality problems. Two recent additions resulted in two serious bugs, and some of those recent changes still need to be polished to reach acceptable quality levels. We should not pile up more serious changes on top of this terrible foundation.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You seems to have missed the fact that this PR is part of such a refactor.
Specifically that prior commit ff60b10 left the '?' query delimiter in the
path()output (where it got wrongly encoded). After this PR we can review all the previously affected callers knowing that path and query are actually separate or re-combined properly depending on which method is used to access them. I expect to find that they are all correct after this change.