Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 126 additions & 62 deletions src/anyp/Uri.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,19 @@ PathChars()
return pathValid;
}

/// Characters which are valid within a URI query section
static const CharacterSet &
QueryChars()
{
/*
* RFC 3986 section 3.4
*
* query = *( pchar / "/" / "?" )
*/
static const auto queryValid = CharacterSet("query", "/?") + PathChars();
return queryValid;
}

/**
* Governed by RFC 3986 section 2.1
*/
Expand Down Expand Up @@ -208,6 +221,17 @@ AnyP::Uri::path() const
return path_;
}

const SBuf &
AnyP::Uri::query() const
{
// FTP does not have a query section
static SBuf empty;
if (scheme_ == AnyP::PROTO_FTP)
return empty;

return query_;
}

void
urlInitialize(void)
{
Expand Down Expand Up @@ -310,6 +334,47 @@ urlAppendDomain(char *host)
return true;
}

/// \returns whether whitespace has 'chopped' the URL apart.
static bool
StripAnyWsp(char *buf)
{
if (stringHasWhitespace(buf)) {
switch (Config.uri_whitespace) {

case URI_WHITESPACE_DENY:
throw TextException("URI has whitespace", Here());

case URI_WHITESPACE_ALLOW:
break;

case URI_WHITESPACE_ENCODE: {
auto *t = rfc1738_escape_unescaped(buf);
xstrncpy(buf, t, MAX_URL);
}
break;

case URI_WHITESPACE_CHOP:
*(buf + strcspn(buf, w_space)) = '\0';
return true;

case URI_WHITESPACE_STRIP:
default: {
char *t = buf;
char *q = buf;
while (*t) {
if (!xisspace(*t)) {
*q = *t;
++q;
}
++t;
}
*q = '\0';
}
}
}
return false;
}

/*
* Parse a URI/URL.
*
Expand All @@ -330,14 +395,14 @@ AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
LOCAL_ARRAY(char, login, MAX_URL);
LOCAL_ARRAY(char, foundHost, MAX_URL);
LOCAL_ARRAY(char, urlpath, MAX_URL);
LOCAL_ARRAY(char, foundQuery, MAX_URL);
char *t = nullptr;
char *q = nullptr;
int foundPort;
int l;
int i;
const char *src;
char *dst;
foundHost[0] = urlpath[0] = login[0] = '\0';
foundHost[0] = urlpath[0] = foundQuery[0] = login[0] = '\0';

if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
Expand Down Expand Up @@ -411,23 +476,58 @@ AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
return false;
*dst = '\0';

// We are looking at path-abempty.
if (*src != '/') {
// path-empty, including the end of the `src` c-string cases
urlpath[0] = '/';
dst = &urlpath[1];
} else {
bool chopped = false;
if (*src == '/') {
dst = urlpath;
}
/* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
*dst = *src;
/* Then everything from / (inclusive) until '?', '#', '\r\n' or '\0' - that's urlpath */
for (; i < l && *src != '?' && *src != '#' && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
*dst = *src;
}
*dst = '\0';
chopped = StripAnyWsp(urlpath);
rfc1738_unescape(urlpath);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should not add serious features to this bad (on several levels) Uri::parse() code until we refactor it to address its known quality problems. Two recent additions resulted in two serious bugs, and some of those recent changes still need to be polished to reach acceptable quality levels. We should not pile up more serious changes on top of this terrible foundation.

Copy link
Contributor Author

@yadij yadij Oct 31, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You seems to have missed the fact that this PR is part of such a refactor.

Specifically that prior commit ff60b10 left the '?' query delimiter in the path() output (where it got wrongly encoded). After this PR we can review all the previously affected callers knowing that path and query are actually separate or re-combined properly depending on which method is used to access them. I expect to find that they are all correct after this change.

} else {
// We should be looking at path-abempty. relative-path is not supported (yet).
urlpath[0] = '/';
urlpath[1] = '\0';
}

/* We -could- be at the end of the buffer here */
if (i > l)
return false;
*dst = '\0';

if (*src == '?') {
dst = foundQuery;
++src; // skip '?' delimiter
++i; // keep track of bytes 'consumed' from src
/* Then everything from '?' (exclusive) until '#', '\r\n' or '\0' - that's query */
for (; i < l && *src != '#' && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
*dst = *src;
}
*dst = '\0';
if (chopped)
*foundQuery = '\0';
else
chopped = StripAnyWsp(foundQuery);
rfc1738_unescape(foundQuery);
}
if (i > l)
return false;

if (*src == '#') {
++src; // skip '#' delimiter
++i; // keep track of bytes 'consumed' from src
/* Then everything from '#' (exclusive) until '\r\n' or '\0' - that's fragment */
for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src) {
; // fragment component is not expected in network protocols. discard for now.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If official parse() code does not discard fragments, then this PR cannot use a "for now" excuse to start discarding them; most likely, this PR should not discard fragments at all, but we can discuss better excuses if really needed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Official code calls it part of "path" component. Which is patently false.

This code splits the components correctly per RFC3986. Keeping the components which are used by ftp, and http(s) URLs.

If you insist on retaining fragment support, I can easily add it back in. Though it would be pointless to store, since it was only actually used by gopher: URLs.

}
}

if (i > l)
return false;

if (*src != '\r' && *src != '\n' && *src != '\0')
throw TextException("invalid URL", Here());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A parsing method that reports success/failure via its boolean return value should not throw on invalid input. The two result-reporting designs are mutually exclusive.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

um, I don't think that is correct use of the term "mutually exclusive".

This method is in transition towards the state where parsing errors are thrown, and the boolean false means needs-more-data. The bits of logic where you do not see that existing are yet to be refactored away.


// If the parsed scheme has no (known) default port, and there is no
// explicit port, then we will reject the zero port during foundPort
Expand Down Expand Up @@ -490,27 +590,20 @@ AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
*t = '\0';
++t;
foundPort = atoi(t);
// BUG: fails to detect non-DIGIT garbage like "http://localhost:80fubar"
}
}

for (t = foundHost; *t; ++t)
*t = xtolower(*t);

if (stringHasWhitespace(foundHost)) {
if (URI_WHITESPACE_STRIP == Config.uri_whitespace) {
t = q = foundHost;
while (*t) {
if (!xisspace(*t)) {
*q = *t;
++q;
}
++t;
}
*q = '\0';
}
if (StripAnyWsp(foundHost)) {
urlpath[0] = '/';
urlpath[1] = '\0';
*foundQuery = '\0';
}

debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");
debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "', query='" << foundQuery << "'");

if (Config.onoff.check_hostnames &&
strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
Expand All @@ -536,41 +629,8 @@ AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
return false;
}

if (stringHasWhitespace(urlpath)) {
debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");

switch (Config.uri_whitespace) {

case URI_WHITESPACE_DENY:
return false;

case URI_WHITESPACE_ALLOW:
break;

case URI_WHITESPACE_ENCODE:
t = rfc1738_escape_unescaped(urlpath);
xstrncpy(urlpath, t, MAX_URL);
break;

case URI_WHITESPACE_CHOP:
*(urlpath + strcspn(urlpath, w_space)) = '\0';
break;

case URI_WHITESPACE_STRIP:
default:
t = q = urlpath;
while (*t) {
if (!xisspace(*t)) {
*q = *t;
++q;
}
++t;
}
*q = '\0';
}
}

setScheme(scheme);
query(SBuf(foundQuery));
path(urlpath);
host(foundHost);
userInfo(SBuf(login));
Expand Down Expand Up @@ -775,8 +835,12 @@ SBuf &
AnyP::Uri::absolutePath() const
{
if (absolutePath_.isEmpty()) {
// TODO: Encode each URI subcomponent in path_ as needed.
absolutePath_ = Encode(path(), PathChars());
if (!path().isEmpty())
absolutePath_ = Encode(path(), PathChars());
if (!query().isEmpty()) {
absolutePath_.append("?");
absolutePath_.append(Encode(query(), QueryChars()));
}
Comment on lines +838 to +843
Copy link
Contributor Author

@yadij yadij Dec 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI: This TODO implementation is the fix for bug 5523 titular issue for PR #2289. The rest of this PR is logic supporting this change.

}

return absolutePath_;
Expand Down
7 changes: 5 additions & 2 deletions src/anyp/Uri.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ class Uri
void path(const SBuf &p) {path_=p; touch();}
const SBuf &path() const;

void query(const SBuf &p) { query_=p; touch(); }
const SBuf &query() const;

/**
* Merge a relative-path URL into the existing URI details.
* Implements RFC 3986 section 5.2.3
Expand Down Expand Up @@ -191,8 +194,8 @@ class Uri

Port port_; ///< authority port subcomponent

// XXX: for now includes query-string.
SBuf path_; ///< URI path segment
SBuf path_; ///< URI path component
SBuf query_; ///< URI query component

// pre-assembled URI forms
mutable SBuf authorityHttp_; ///< RFC 7230 section 5.3.3 authority, maybe without default-port
Expand Down
24 changes: 7 additions & 17 deletions src/cache_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -125,17 +125,14 @@ CacheManager::WellKnownUrlPathPrefix()
Mgr::Command::Pointer
CacheManager::ParseUrl(const AnyP::Uri &uri)
{
Parser::Tokenizer tok(uri.path());

Assure(tok.skip(WellKnownUrlPathPrefix()));
auto upath = uri.path();
Assure(upath.startsWith(WellKnownUrlPathPrefix()));

Mgr::Command::Pointer cmd = new Mgr::Command();
cmd->params.httpUri = SBufToString(uri.absolute());

static const auto fieldChars = CharacterSet("mgr-field", "?#").complement();

SBuf action;
if (!tok.prefix(action, fieldChars)) {
SBuf action = upath.substr(WellKnownUrlPathPrefix().length()); // path without the prefix
if (action.isEmpty()) {
static const SBuf indexReport("index");
action = indexReport;
}
Expand All @@ -150,16 +147,9 @@ CacheManager::ParseUrl(const AnyP::Uri &uri)
throw TextException(ToSBuf("action '", action, "' is ", prot), Here());
cmd->profile = profile;

// TODO: fix when AnyP::Uri::parse() separates path?query#fragment
SBuf params;
if (tok.skip('?')) {
params = tok.remaining();
Mgr::QueryParams::Parse(tok, cmd->params.queryParams);
}

if (!tok.skip('#') && !tok.atEnd())
throw TextException("invalid characters in URL", Here());
// else ignore #fragment (if any)
SBuf params = uri.query();
if (!params.isEmpty())
Mgr::QueryParams::Parse(params, cmd->params.queryParams);

debugs(16, 3, "MGR request: host=" << uri.host() << ", action=" << action << ", params=" << params);

Expand Down
9 changes: 3 additions & 6 deletions src/carp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -185,14 +185,11 @@ carpSelectParent(PeerSelector *ps)
key.appendf(":%hu", request->url.port().value_or(0));
}
if (tp->options.carp_key.path) {
// XXX: fix when path and query are separate
key.append(request->url.absolutePath().substr(0,request->url.absolutePath().find('?'))); // 0..N
key.append(request->url.path());
}
if (tp->options.carp_key.params) {
// XXX: fix when path and query are separate
SBuf::size_type pos;
if ((pos=request->url.absolutePath().find('?')) != SBuf::npos)
key.append(request->url.absolutePath().substr(pos)); // N..npos
key.append('?');
key.append(request->url.query());
}
}
// if the url-based key is empty, e.g. because the user is
Expand Down
9 changes: 3 additions & 6 deletions src/mgr/QueryParams.cc
Original file line number Diff line number Diff line change
Expand Up @@ -109,18 +109,15 @@ ParseParamValue(const SBuf &rawValue)
* value = *pchar | ( 1*DIGIT *( ',' 1*DIGIT ) )
*/
void
Mgr::QueryParams::Parse(Parser::Tokenizer &tok, QueryParams &aParams)
Mgr::QueryParams::Parse(SBuf &buf, QueryParams &aParams)
{
static const CharacterSet nameChars = CharacterSet("param-name", "_") + CharacterSet::ALPHA + CharacterSet::DIGIT;
static const CharacterSet valueChars = CharacterSet("param-value", "&= #").complement();
static const CharacterSet delimChars("param-delim", "&");

while (!tok.atEnd()) {
Parser::Tokenizer tok(buf);

// TODO: remove '#' processing when AnyP::Uri splits 'query#fragment' properly
// #fragment handled by caller. Do not throw.
if (tok.remaining()[0] == '#')
return;
while (!tok.atEnd()) {

if (tok.skipAll(delimChars))
continue;
Expand Down
4 changes: 2 additions & 2 deletions src/mgr/QueryParams.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

#include "ipc/forward.h"
#include "mgr/QueryParam.h"
#include "parser/forward.h"
#include "sbuf/forward.h"
#include "SquidString.h"

#include <utility>
Expand All @@ -34,7 +34,7 @@ class QueryParams
void pack(Ipc::TypedMsgHdr& msg) const; ///< store params into msg
void unpack(const Ipc::TypedMsgHdr& msg); ///< load params from msg
/// parses the query string parameters
static void Parse(Parser::Tokenizer &, QueryParams &);
static void Parse(SBuf &, QueryParams &);

private:
/// find query parameter by name
Expand Down
2 changes: 1 addition & 1 deletion src/tests/stub_libmgr.cc
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ void Mgr::IoAction::dump(StoreEntry*) STUB
Mgr::QueryParam::Pointer Mgr::QueryParams::get(const String&) const STUB_RETVAL(Mgr::QueryParam::Pointer(nullptr))
void Mgr::QueryParams::pack(Ipc::TypedMsgHdr&) const STUB
void Mgr::QueryParams::unpack(const Ipc::TypedMsgHdr&) STUB
void Mgr::QueryParams::Parse(Parser::Tokenizer &, QueryParams &) STUB
void Mgr::QueryParams::Parse(SBuf &, QueryParams &) STUB
//private:
//Params::const_iterator Mgr::QueryParams::find(const String&) const STUB_RETVAL(new Mgr::Params::const_iterator(*this))
Mgr::QueryParam::Pointer Mgr::QueryParams::CreateParam(QueryParam::Type) STUB_RETVAL(Mgr::QueryParam::Pointer(nullptr))
Expand Down
Loading
Loading