Skip to content

Commit

Permalink
Add Path-accepting Jsoup methods (#2055)
Browse files Browse the repository at this point in the history
Add Path-accepting Jsoup methods

Also improve file parsing using SeekableByteChannel

Clean up parseInputStream

---------

Co-authored-by: Jonathan Hedley <jonathan@hedley.net>
  • Loading branch information
Isira-Seneviratne and jhy authored Dec 30, 2023
1 parent 38615af commit c37e8d6
Show file tree
Hide file tree
Showing 6 changed files with 172 additions and 31 deletions.
9 changes: 9 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
# jsoup Changelog

## 1.18.1 (Pending)

### Improvements

* Added `Path`-accepting parse methods: `Jsoup.parse(Path)`, `Jsoup.parse(path, charsetName, baseUri, parser)`,
etc. [2055](https://github.com/jhy/jsoup/pull/2055)

---

## 1.17.2 (2023-Dec-29)

### Improvements
Expand Down
4 changes: 4 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,12 @@
<version>2.3.3_r2</version>
</signature>
<ignores>
<ignore>java.io.File</ignore> <!-- File#toPath() -->
<ignore>java.nio.file.*</ignore>
<ignore>java.nio.channels.SeekableByteChannel</ignore>
<ignore>java.util.function.*</ignore>
<ignore>java.util.stream.*</ignore>
<ignore>java.lang.Throwable</ignore> <!-- Throwable#addSuppressed(Throwable) -->
<ignore>java.lang.ThreadLocal</ignore>
<ignore>java.io.UncheckedIOException</ignore>
<ignore>java.util.List</ignore> <!-- List#stream() -->
Expand Down
67 changes: 67 additions & 0 deletions src/main/java/org/jsoup/Jsoup.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Path;

/**
The core public access point to the jsoup functionality.
Expand Down Expand Up @@ -183,6 +184,72 @@ public static Document parse(File file, @Nullable String charsetName, String bas
return DataUtil.load(file, charsetName, baseUri, parser);
}

/**
 Load the file at the given path and parse it as HTML, resolving relative links against the supplied base URI.

 @param path location of the file to read HTML from. Gzipped files (ending in .z or .gz) are supported.
 @param charsetName (optional) character set of the file contents. Pass {@code null} to have it determined from an
 {@code http-equiv} meta tag if present, otherwise falling back to {@code UTF-8} (which is often safe to do).
 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against.
 @return sane HTML
 @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
 @since 1.18.1
 */
public static Document parse(Path path, @Nullable String charsetName, String baseUri) throws IOException {
    final Document parsed = DataUtil.load(path, charsetName, baseUri);
    return parsed;
}

/**
 Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.

 @param path file to load HTML from. Supports gzipped files (ending in .z or .gz).
 @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if
 present, or fall back to {@code UTF-8} (which is often safe to do).
 @return sane HTML
 @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
 @see #parse(Path, String, String) parse(path, charset, baseUri)
 @since 1.18.1
 */
public static Document parse(Path path, @Nullable String charsetName) throws IOException {
    // the absolute location of the file doubles as the base URI for relative links
    final String baseUri = path.toAbsolutePath().toString();
    return DataUtil.load(path, charsetName, baseUri);
}

/**
 Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs.
 The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code <meta charset>} tag,
 or if neither is present, will be {@code UTF-8}.
 <p>This is the equivalent of calling {@link #parse(Path, String) parse(path, null)}</p>

 @param path the file to load HTML from. Supports gzipped files (ending in .z or .gz).
 @return sane HTML
 @throws IOException if the file could not be found or read.
 @see #parse(Path, String, String) parse(path, charset, baseUri)
 @since 1.18.1
 */
public static Document parse(Path path) throws IOException {
    // no charset is supplied (null), and the file's absolute location is used as the base URI
    final String baseUri = path.toAbsolutePath().toString();
    return DataUtil.load(path, null, baseUri);
}

/**
 Load the file at the given path and parse it with the supplied parser, resolving relative links against the
 supplied base URI.

 @param path location of the file to read from. Gzipped files (ending in .z or .gz) are supported.
 @param charsetName (optional) character set of the file contents. Pass {@code null} to have it determined from an
 {@code http-equiv} meta tag if present, otherwise falling back to {@code UTF-8} (which is often safe to do).
 @param baseUri The URL where the HTML was retrieved from, to resolve relative links against.
 @param parser alternate {@link Parser#xmlParser() parser} to use.
 @return sane HTML
 @throws IOException if the file could not be found, or read, or if the charsetName is invalid.
 @since 1.18.1
 */
public static Document parse(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
    final Document parsed = DataUtil.load(path, charsetName, baseUri, parser);
    return parsed;
}

/**
Read an input stream, and parse it to a Document.
Expand Down
83 changes: 53 additions & 30 deletions src/main/java/org/jsoup/helper/DataUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import org.jsoup.internal.ControllableInputStream;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.SharedConstants;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document;
Expand All @@ -16,7 +15,6 @@
import java.io.BufferedReader;
import java.io.CharArrayReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
Expand All @@ -25,8 +23,12 @@
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import java.util.Random;
import java.util.regex.Matcher;
Expand Down Expand Up @@ -63,7 +65,7 @@ private DataUtil() {}
* @throws IOException on IO error
*/
public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
    // Convert to a Path and hand off to the Path-based overload, which parses with the HTML parser.
    return load(file.toPath(), charsetName, baseUri);
}

/**
Expand All @@ -81,18 +83,48 @@ public static Document load(File file, @Nullable String charsetName, String base
* @since 1.14.2
*/
public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
InputStream stream = new FileInputStream(file);
String name = Normalizer.lowerCase(file.getName());
if (name.endsWith(".gz") || name.endsWith(".z")) {
// unfortunately file input streams don't support marks (why not?), so we will close and reopen after read
boolean zipped;
try {
zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
} finally {
stream.close();
return load(file.toPath(), charsetName, baseUri, parser);
}

/**
 * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
 * are supported in addition to uncompressed files.
 *
 * @param path file to load
 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
 *     the file will always override this setting.
 * @param baseUri base URI of document, to resolve relative links against
 * @return Document
 * @throws IOException on IO error
 * @since 1.18.1
 */
public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException {
    final Parser htmlParser = Parser.htmlParser();
    return load(path, charsetName, baseUri, htmlParser);
}

/**
 * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
 * are supported in addition to uncompressed files.
 *
 * @param path file to load
 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
 *     the file will always override this setting.
 * @param baseUri base URI of document, to resolve relative links against
 * @param parser alternate {@link Parser#xmlParser() parser} to use.
 * @return Document
 * @throws IOException on IO error
 * @since 1.18.1
 */
public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
    final SeekableByteChannel byteChannel = Files.newByteChannel(path);
    InputStream stream = Channels.newInputStream(byteChannel);
    try {
        // getFileName() is null for a path with no name elements (e.g. a root); treat that as "no suffix"
        final Path fileName = path.getFileName();
        final String name = fileName != null ? Normalizer.lowerCase(fileName.toString()) : "";
        if (name.endsWith(".gz") || name.endsWith(".z")) {
            // the suffix alone is not trusted: sniff the gzip magic bytes, then rewind so parsing starts at byte 0
            final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
            byteChannel.position(0); // reset to start of file
            if (zipped) {
                stream = new GZIPInputStream(stream);
            }
        }
    } catch (IOException e) {
        // failed before the stream was handed to parseInputStream; release the file handle here.
        // Closing the stream also closes the underlying channel.
        try {
            stream.close();
        } catch (IOException closeError) {
            e.addSuppressed(closeError);
        }
        throw e;
    }
    return parseInputStream(stream, charsetName, baseUri, parser);
}
Expand Down Expand Up @@ -139,16 +171,15 @@ static void crossStreams(final InputStream in, final OutputStream out) throws IO
static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
if (input == null) // empty body
return new Document(baseUri);
input = ControllableInputStream.wrap(input, DefaultBufferSize, 0);

@Nullable Document doc = null;

// read the start of the stream and look for a BOM or meta charset
try {
input.mark(DefaultBufferSize);
ByteBuffer firstBytes = readToByteBuffer(input, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid.
boolean fullyRead = (input.read() == -1);
input.reset();
try (InputStream wrappedInputStream = ControllableInputStream.wrap(input, DefaultBufferSize, 0)) {
wrappedInputStream.mark(DefaultBufferSize);
ByteBuffer firstBytes = readToByteBuffer(wrappedInputStream, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid.
boolean fullyRead = (wrappedInputStream.read() == -1);
wrappedInputStream.reset();

// look for BOM - overrides any other header or input
BomCharset bomCharset = detectCharsetFromBom(firstBytes);
Expand Down Expand Up @@ -189,9 +220,8 @@ else if (first instanceof Comment) {
if (comment.isXmlDeclaration())
decl = comment.asXmlDeclaration();
}
if (decl != null) {
if (decl.name().equalsIgnoreCase("xml"))
foundCharset = decl.attr("encoding");
if (decl != null && decl.name().equalsIgnoreCase("xml")) {
foundCharset = decl.attr("encoding");
}
}
foundCharset = validateCharset(foundCharset);
Expand All @@ -208,8 +238,7 @@ else if (first instanceof Comment) {
if (doc == null) {
if (charsetName == null)
charsetName = defaultCharsetName;
BufferedReader reader = new BufferedReader(new InputStreamReader(input, Charset.forName(charsetName)), DefaultBufferSize); // Android level does not allow us try-with-resources
try {
try (BufferedReader reader = new BufferedReader(new InputStreamReader(wrappedInputStream, Charset.forName(charsetName)), DefaultBufferSize)) {
if (bomCharset != null && bomCharset.offset) { // creating the buffered reader ignores the input pos, so must skip here
long skipped = reader.skip(1);
Validate.isTrue(skipped == 1); // WTF if this fails.
Expand All @@ -227,14 +256,8 @@ else if (first instanceof Comment) {
doc.charset(UTF_8);
}
}
finally {
reader.close();
}
}
}
finally {
input.close();
}
return doc;
}

Expand Down
29 changes: 28 additions & 1 deletion src/test/java/org/jsoup/helper/DataUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

import static org.jsoup.integration.ParseTest.getFile;
import static org.jsoup.integration.ParseTest.getPath;
import static org.junit.jupiter.api.Assertions.*;

public class DataUtilTest {
Expand Down Expand Up @@ -207,13 +209,21 @@ public void supportsXmlCharsetDeclaration() throws IOException {


@Test
public void lLoadsGzipFile() throws IOException {
public void loadsGzipFile() throws IOException {
File in = getFile("/htmltests/gzip.html.gz");
Document doc = Jsoup.parse(in, null);
assertEquals("Gzip test", doc.title());
assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text());
}

// Path counterpart of the gzip File test: a .gz resource parses to the expected title and paragraph.
@Test
public void loadsGzipPath() throws IOException {
    final Path gzipped = getPath("/htmltests/gzip.html.gz");
    final Document parsed = Jsoup.parse(gzipped, null);

    final String title = parsed.title();
    assertEquals("Gzip test", title);

    final String paragraph = parsed.selectFirst("p").text();
    assertEquals("This is a gzipped HTML file.", paragraph);
}

@Test
public void loadsZGzipFile() throws IOException {
// compressed on win, with z suffix
Expand All @@ -223,6 +233,15 @@ public void loadsZGzipFile() throws IOException {
assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text());
}

// Path counterpart of the .z File test: a gzip resource with a .z suffix parses to the expected content.
@Test
public void loadsZGzipPath() throws IOException {
    // compressed on win, with z suffix
    final Path zSuffixed = getPath("/htmltests/gzip.html.z");
    final Document parsed = Jsoup.parse(zSuffixed, null);

    final String title = parsed.title();
    assertEquals("Gzip test", title);

    final String paragraph = parsed.selectFirst("p").text();
    assertEquals("This is a gzipped HTML file.", paragraph);
}

@Test
public void handlesFakeGzipFile() throws IOException {
File in = getFile("/htmltests/fake-gzip.html.gz");
Expand All @@ -231,6 +250,14 @@ public void handlesFakeGzipFile() throws IOException {
assertEquals("And should still be readable.", doc.selectFirst("p").text());
}

// A resource named .gz whose content is not actually gzipped must still be read as plain HTML.
@Test
public void handlesFakeGzipPath() throws IOException {
    final Path notReallyGzip = getPath("/htmltests/fake-gzip.html.gz");
    final Document parsed = Jsoup.parse(notReallyGzip, null);

    final String title = parsed.title();
    assertEquals("This is not gzipped", title);

    final String paragraph = parsed.selectFirst("p").text();
    assertEquals("And should still be readable.", paragraph);
}

// an input stream to give a range of output sizes, that changes on each read
static class VaryingReadInputStream extends InputStream {
final InputStream in;
Expand Down
11 changes: 11 additions & 0 deletions src/test/java/org/jsoup/integration/ParseTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.zip.GZIPInputStream;

import static org.junit.jupiter.api.Assertions.*;
Expand Down Expand Up @@ -133,6 +135,15 @@ public static File getFile(String resourceName) {
}
}

/**
 * Resolves a classpath resource, looked up relative to {@code ParseTest}, to a {@link Path}.
 *
 * @param resourceName name of the resource to look up
 * @return the resource's location as a Path, or the placeholder path {@code /404} when the resource is not found
 * @throws IllegalStateException if the resource URL cannot be converted to a URI
 */
public static Path getPath(String resourceName) {
    final URL located = ParseTest.class.getResource(resourceName);
    if (located == null) {
        // placeholder path for a missing resource (presumably nonexistent, so a later read fails)
        return Paths.get("/404");
    }
    try {
        return Paths.get(located.toURI());
    } catch (URISyntaxException e) {
        throw new IllegalStateException(e);
    }
}

/**
 * Wraps the UTF-8 encoded bytes of a string in an in-memory input stream.
 *
 * @param s text to expose as a stream
 * @return a stream over the UTF-8 bytes of {@code s}
 */
public static InputStream inputStreamFrom(String s) {
    final byte[] utf8 = s.getBytes(StandardCharsets.UTF_8);
    return new ByteArrayInputStream(utf8);
}
Expand Down

0 comments on commit c37e8d6

Please sign in to comment.