diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java index 48c4a666ac..b350af9a16 100644 --- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java @@ -242,9 +242,11 @@ public String normalize(String urlString, String scope) if (normalizePath) { // check for unnecessary use of "/../", "/./", and "//" if (changed) { - url = new URL(protocol, host, port, file); + URL u = new URL(protocol, host, port, file); + file2 = getFileWithNormalizedPath(u); + } else { + file2 = getFileWithNormalizedPath(url); } - file2 = getFileWithNormalizedPath(url); if (!file.equals(file2)) { changed = true; file = file2; @@ -252,8 +254,25 @@ public String normalize(String urlString, String scope) } if (changed) { - url = new URL(protocol, host, port, file); - urlString = url.toString(); + if (protocol.equals("http") || protocol.equals("https") + || url.getUserInfo() == null) { + url = new URL(protocol, host, port, file); + urlString = url.toString(); + } else { + /* + * NUTCH-3087 - userinfo is required for protocols with frequent + * authentication. Note: need to build the URL string directly, because + * there is no URL constructor which takes the userinfo as parameter. + */ + StringBuilder sb = new StringBuilder(); + sb.append(protocol).append("://").append(url.getUserInfo()).append('@') + .append(host); + if (port != -1) { + sb.append(':').append(port); + } + sb.append(file); + urlString = sb.toString(); + } } return urlString; diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java index 8f3a1fdaa9..d48e585101 100644 --- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java @@ -216,6 +216,32 @@ public void testNormalizer() throws Exception { normalizeTest("file:/var/www/html/////./bar/index.html", "file:/var/www/html/bar/index.html"); } + + @Test + public void testNUTCH3087() throws Exception { + // NUTCH-3087 userinfo to be kept in URLs with protocols usually requiring + // authentication + normalizeTest("ftp://user@ftp.example.org/path/file.txt", + "ftp://user@ftp.example.org/path/file.txt"); + normalizeTest("ftp://john.doe@ftp.example.org/", + "ftp://john.doe@ftp.example.org/"); + normalizeTest("ftp://user:password@ftp.example.org/path/file.txt", + "ftp://user:password@ftp.example.org/path/file.txt"); + // But for HTTP(S) the userinfo should be removed. + // (example from https://en.wikipedia.org/wiki/Uniform_Resource_Identifier) + normalizeTest( + "https://john.doe@www.example.com:1234/forum/questions/?tag=networking&order=newest#top", + "https://www.example.com:1234/forum/questions/?tag=networking&order=newest"); + // URLs with IPv6 address + normalizeTest("ftp://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/../path/file.txt", + "ftp://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/path/file.txt"); + normalizeTest("https://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/", + "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/"); + normalizeTest("https://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]:443/", + "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/"); + normalizeTest("https://user@[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/path/../to/index.html", + "https://[2600:1f18:200d:fb00:2b74:867c:ab0c:150a]/to/index.html"); + } @Test public void testCurlyBraces() throws Exception {