Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NUTCH-3087 BasicURLNormalizer to keep userinfo for protocols which might require it #845

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -242,18 +242,37 @@ public String normalize(String urlString, String scope)
if (normalizePath) {
// check for unnecessary use of "/../", "/./", and "//"
if (changed) {
url = new URL(protocol, host, port, file);
URL u = new URL(protocol, host, port, file);
file2 = getFileWithNormalizedPath(u);
} else {
file2 = getFileWithNormalizedPath(url);
}
file2 = getFileWithNormalizedPath(url);
if (!file.equals(file2)) {
changed = true;
file = file2;
}
}

if (changed) {
url = new URL(protocol, host, port, file);
urlString = url.toString();
if (protocol.equals("http") || protocol.equals("https")
|| url.getUserInfo() == null) {
url = new URL(protocol, host, port, file);
urlString = url.toString();
} else {
/*
* NUTCH-3087 - userinfo is required for protocols with frequent
* authentication. Note: need to build the URL string directly, because
* there is no URL constructor which takes the userinfo as parameter.
*/
StringBuilder sb = new StringBuilder();
sb.append(protocol).append("://").append(url.getUserInfo()).append('@')
.append(host);
if (port != -1) {
sb.append(':').append(port);
}
sb.append(file);
urlString = sb.toString();
}
}

return urlString;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,32 @@ public void testNormalizer() throws Exception {
normalizeTest("file:/var/www/html/////./bar/index.html",
"file:/var/www/html/bar/index.html");
}

@Test
public void testNUTCH3087() throws Exception {
  /*
   * NUTCH-3087: the userinfo part of a URL is expected to survive
   * normalization for protocols which usually require authentication (here:
   * ftp), while for HTTP(S) it is expected to be removed.
   *
   * Each row is a pair { input URL, expected normalized URL }.
   */
  final String[][] expectations = {
      // ftp: userinfo is kept (user only, dotted user name, user:password)
      { "ftp://user@ftp.example.org/path/file.txt",
          "ftp://user@ftp.example.org/path/file.txt" },
      { "ftp://john.doe@ftp.example.org/",
          "ftp://john.doe@ftp.example.org/" },
      { "ftp://user:password@ftp.example.org/path/file.txt",
          "ftp://user:password@ftp.example.org/path/file.txt" },
      // https: userinfo is dropped
      // (example from https://en.wikipedia.org/wiki/Uniform_Resource_Identifier)
      { "https://john.doe@www.example.com:1234/forum/questions/?tag=networking&order=newest#top",
          "https://www.example.com:1234/forum/questions/?tag=networking&order=newest" },
      // IPv6 address as host, combined with path normalization (ftp keeps
      // the userinfo, https loses it)
      { "ftp://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/../path/file.txt",
          "ftp://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/path/file.txt" },
      { "https://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/",
          "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/" },
      { "https://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]:443/",
          "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/" },
      { "https://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/path/../to/index.html",
          "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/to/index.html" } };

  // Verify the pairs in the order listed; the first mismatch fails the test.
  for (String[] pair : expectations) {
    final String input = pair[0];
    final String expected = pair[1];
    normalizeTest(input, expected);
  }
}

@Test
public void testCurlyBraces() throws Exception {
Expand Down
Loading