From f2f246a61a4c654ef5ff65a530c2c0529935a304 Mon Sep 17 00:00:00 2001
From: Sebastian Thomschke <sebthom@users.noreply.github.com>
Date: Tue, 20 Aug 2024 15:34:19 +0200
Subject: [PATCH 1/2] fix: DocumentInputStream does not handle surrogate pairs
 correctly

---
 org.eclipse.lsp4e.test/pom.xml                |   2 +-
 .../test/internal/CharsInputStreamTest.java   | 127 ++++++++
 .../internal/DocumentInputStreamTest.java     | 152 +++++++++
 .../eclipse/lsp4e/DocumentInputStream.java    |  36 +--
 .../lsp4e/internal/CharsInputStream.java      | 295 ++++++++++++++++++
 .../eclipse/lsp4e/internal/DocumentUtil.java  |  20 +-
 6 files changed, 602 insertions(+), 30 deletions(-)
 create mode 100644 org.eclipse.lsp4e.test/src/org/eclipse/lsp4e/test/internal/CharsInputStreamTest.java
 create mode 100644 org.eclipse.lsp4e.test/src/org/eclipse/lsp4e/test/internal/DocumentInputStreamTest.java
 create mode 100644 org.eclipse.lsp4e/src/org/eclipse/lsp4e/internal/CharsInputStream.java
diff --git a/org.eclipse.lsp4e.test/pom.xml b/org.eclipse.lsp4e.test/pom.xml
index 42684cb2c..7192c33da 100644
--- a/org.eclipse.lsp4e.test/pom.xml
+++ b/org.eclipse.lsp4e.test/pom.xml
@@ -42,7 +42,7 @@
 					<useUIHarness>true</useUIHarness>
 					<useUIThread>true</useUIThread>
 					<forkedProcessTimeoutInSeconds>1200</forkedProcessTimeoutInSeconds>
-					<argLine>-Xms1g -Xmx1g -Djava.util.logging.config.file=${project.basedir}/src/jul.properties ${ui.test.vmargs} ${os-jvm-flags}</argLine>
+					<argLine>-Dfile.encoding=${project.build.sourceEncoding} -Xms1g -Xmx1g -Djava.util.logging.config.file=${project.basedir}/src/jul.properties ${ui.test.vmargs} ${os-jvm-flags}</argLine>
 				</configuration>
 			</plugin>
 		</plugins>
diff --git a/org.eclipse.lsp4e.test/src/org/eclipse/lsp4e/test/internal/CharsInputStreamTest.java b/org.eclipse.lsp4e.test/src/org/eclipse/lsp4e/test/internal/CharsInputStreamTest.java
new file mode 100644
index 000000000..80f3c4fea
--- /dev/null
+++ b/org.eclipse.lsp4e.test/src/org/eclipse/lsp4e/test/internal/CharsInputStreamTest.java
@@ -0,0 +1,127 @@
+/*******************************************************************************
+ * Copyright (c) 2024 Sebastian Thomschke and others.
+ * This program and the accompanying materials are made
+ * available under the terms of the Eclipse Public License 2.0
+ * which is available at https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ * Contributors:
+ * Sebastian Thomschke - initial implementation
+ *******************************************************************************/
+package org.eclipse.lsp4e.test.internal;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.eclipse.lsp4e.internal.CharsInputStream;
+import org.junit.Test;
+
+public class CharsInputStreamTest {
+
+	private static final String TEST_ASCII = "Hello, World!";
+
+	private static final String EMOJI = "😊";
+	private static final int EMOJI_BYTES_LEN = EMOJI.getBytes(UTF_8).length;
+	private static final String JAPANESE = "こんにちは";
+	private static final String TEST_UNICODE = EMOJI + JAPANESE;
+	private static final int TEST_UNICODE_BYTES_LEN = TEST_UNICODE.getBytes(UTF_8).length;
+
+	@Test
+	public void testAvailable() throws IOException {
+		try (var is = new CharsInputStream(TEST_ASCII)) {
+			assertEquals(TEST_ASCII.length(), is.available());
+			final byte[] buffer = new byte[4];
+			is.read(buffer);
+			assertEquals(TEST_ASCII.length() - 4, is.available());
+			is.readAllBytes();
+			assertEquals(0, is.available());
+		}
+
+		try (var is = new CharsInputStream(TEST_UNICODE)) {
+			assertTrue(is.available() > 0);
+			is.read(new byte[10]);
+			assertTrue(is.available() > 0);
+			is.readAllBytes();
+			assertEquals(0, is.available());
+		}
+	}
+
+	@Test
+	public void testEndOfStream() throws IOException {
+		try (var is = new CharsInputStream(TEST_UNICODE)) {
+			is.skip(Long.MAX_VALUE);
+			assertEquals(-1, is.read());
+		}
+	}
+
+	@Test
+	public void testReadEachByte() throws IOException {
+		try (var is = new CharsInputStream(TEST_UNICODE)) {
+			final var bytesRead = new ArrayList<Byte>();
+			int b;
+			while ((b = is.read()) != -1) {
+				bytesRead.add((byte) b);
+			}
+
+			final byte[] byteArray = new byte[bytesRead.size()];
+			for (int i = 0; i < bytesRead.size(); i++) {
+				byteArray[i] = bytesRead.get(i);
+			}
+			assertEquals(TEST_UNICODE, new String(byteArray, UTF_8));
+		}
+	}
+
+	@Test
+	public void testReadIntoByteArray() throws IOException {
+		final byte[] buffer = new byte[1024]; // Buffer to read a portion of the text
+
+		try (var is = new CharsInputStream(TEST_UNICODE)) {
+			final int bytesRead = is.read(buffer, 0, buffer.length);
+
+			assertEquals(TEST_UNICODE, new String(buffer, 0, bytesRead, UTF_8));
+		}
+	}
+
+	@Test
+	public void testSkip() throws IOException {
+		try (var is = new CharsInputStream(TEST_UNICODE)) {
+			// skip emoji
+			final long skipped = is.skip(EMOJI_BYTES_LEN);
+			assertEquals(EMOJI_BYTES_LEN, skipped);
+
+			final byte[] japanese = new byte[TEST_UNICODE_BYTES_LEN];
+			final int bytesRead = is.read(japanese);
+
+			assertEquals(JAPANESE, new String(japanese, 0, bytesRead, UTF_8));
+		}
+	}
+
+	@Test
+	public void testHighSurrogateAtEndOfInput() throws IOException {
+		final char[] invalidSequence = { 'A', '\uD800' }; // valid char followed by an isolated high surrogate
+		try (var is = new CharsInputStream(new String(invalidSequence), UTF_8)) {
+			final byte[] result = is.readAllBytes();
+			final String output = new String(result, UTF_8);
+
+			// the high surrogate at the end should be replaced by the
+			// Unicode replacement char
+			assertEquals("A" + CharsInputStream.UNICODE_REPLACEMENT_CHAR, output);
+		}
+	}
+
+	@Test
+	public void testHighSurrogateWithoutLowSurrogate() throws IOException {
+		final char[] invalidSequence = { '\uD800', 'A' }; // \uD800 is a high surrogate, followed by 'A'
+		try (var is = new CharsInputStream(new String(invalidSequence), UTF_8)) {
+			final byte[] result = is.readAllBytes();
+			final String output = new String(result, UTF_8);
+
+			// the invalid surrogate pair should be replaced by the Unicode replacement char
+			assertEquals(CharsInputStream.UNICODE_REPLACEMENT_CHAR + "A", output);
+		}
+	}
+}
diff --git a/org.eclipse.lsp4e.test/src/org/eclipse/lsp4e/test/internal/DocumentInputStreamTest.java b/org.eclipse.lsp4e.test/src/org/eclipse/lsp4e/test/internal/DocumentInputStreamTest.java
new file mode 100644
index 000000000..dc1b35854
--- /dev/null
+++ b/org.eclipse.lsp4e.test/src/org/eclipse/lsp4e/test/internal/DocumentInputStreamTest.java
@@ -0,0 +1,152 @@
+/*******************************************************************************
+ * Copyright (c) 2024 Sebastian Thomschke and others.
+ * This program and the accompanying materials are made
+ * available under the terms of the Eclipse Public License 2.0
+ * which is available at https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ * Contributors:
+ * Sebastian Thomschke - initial implementation
+ *******************************************************************************/
+package org.eclipse.lsp4e.test.internal;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.eclipse.core.runtime.CoreException;
+import org.eclipse.jface.text.IDocument;
+import org.eclipse.lsp4e.DocumentInputStream;
+import org.eclipse.lsp4e.LSPEclipseUtils;
+import org.eclipse.lsp4e.internal.CharsInputStream;
+import org.eclipse.lsp4e.test.utils.AbstractTestWithProject;
+import org.eclipse.lsp4e.test.utils.TestUtils;
+import org.junit.Before;
+import org.junit.Test;
+
+public class DocumentInputStreamTest extends AbstractTestWithProject {
+
+	private static final String TEST_ASCII = "Hello, World!";
+
+	private static final String EMOJI = "😊";
+	private static final int EMOJI_BYTES_LEN = EMOJI.getBytes(UTF_8).length;
+	private static final String JAPANESE = "こんにちは";
+	private static final String TEST_UNICODE = EMOJI + JAPANESE;
+	private static final int TEST_UNICODE_BYTES_LEN = TEST_UNICODE.getBytes(UTF_8).length;
+
+	private IDocument document;
+
+	@Before
+	public void setUp() throws CoreException {
+		final var testFile = TestUtils.createUniqueTestFile(project, TEST_UNICODE);
+		document = LSPEclipseUtils.getDocument(testFile);
+	}
+
+	@Test
+	public void testAvailable() throws IOException {
+		document.set(TEST_ASCII);
+		try (var is = new DocumentInputStream(document)) {
+			assertEquals(UTF_8, is.getCharset());
+			assertEquals(TEST_ASCII.length(), is.available());
+			final byte[] buffer = new byte[4];
+			is.read(buffer);
+			assertEquals(TEST_ASCII.length() - 4, is.available());
+			is.readAllBytes();
+			assertEquals(0, is.available());
+		}
+
+		document.set(TEST_UNICODE);
+		try (var is = new DocumentInputStream(document)) {
+			assertEquals(UTF_8, is.getCharset());
+			assertTrue(is.available() > 0);
+			is.read(new byte[10]);
+			assertTrue(is.available() > 0);
+			is.readAllBytes();
+			assertEquals(0, is.available());
+		}
+	}
+
+	@Test
+	public void testEndOfStream() throws IOException {
+		try (var is = new DocumentInputStream(document)) {
+			assertEquals(UTF_8, is.getCharset());
+			is.skip(Long.MAX_VALUE);
+			assertEquals(-1, is.read());
+		}
+	}
+
+	@Test
+	public void testReadEachByte() throws IOException {
+		try (var is = new DocumentInputStream(document)) {
+			assertEquals(UTF_8, is.getCharset());
+			final var bytesRead = new ArrayList<Byte>();
+			int b;
+			while ((b = is.read()) != -1) {
+				bytesRead.add((byte) b);
+			}
+
+			final byte[] byteArray = new byte[bytesRead.size()];
+			for (int i = 0; i < bytesRead.size(); i++) {
+				byteArray[i] = bytesRead.get(i);
+			}
+			assertEquals(TEST_UNICODE, new String(byteArray, UTF_8));
+		}
+	}
+
+	@Test
+	public void testReadIntoByteArray() throws IOException {
+		final byte[] buffer = new byte[1024]; // Buffer to read a portion of the text
+
+		try (var is = new DocumentInputStream(document)) {
+			assertEquals(UTF_8, is.getCharset());
+			final int bytesRead = is.read(buffer, 0, buffer.length);
+
+			assertEquals(TEST_UNICODE, new String(buffer, 0, bytesRead, UTF_8));
+		}
+	}
+
+	@Test
+	public void testSkip() throws IOException {
+		try (var is = new DocumentInputStream(document)) {
+			assertEquals(UTF_8, is.getCharset());
+			// skip emoji
+			final long skipped = is.skip(EMOJI_BYTES_LEN);
+			assertEquals(EMOJI_BYTES_LEN, skipped);
+
+			final byte[] japanese = new byte[TEST_UNICODE_BYTES_LEN];
+			final int bytesRead = is.read(japanese);
+
+			assertEquals(JAPANESE, new String(japanese, 0, bytesRead, UTF_8));
+		}
+	}
+
+	@Test
+	public void testHighSurrogateAtEndOfInput() throws IOException {
+		document.set(new String(new char[] { 'A', '\uD800' })); // valid char followed by an isolated high surrogate
+		try (var is = new DocumentInputStream(document)) {
+			assertEquals(UTF_8, is.getCharset());
+			final byte[] result = is.readAllBytes();
+			final String output = new String(result, UTF_8);
+
+			// the high surrogate at the end should be replaced by the
+			// Unicode replacement char
+			assertEquals("A" + CharsInputStream.UNICODE_REPLACEMENT_CHAR, output);
+		}
+	}
+
+	@Test
+	public void testHighSurrogateWithoutLowSurrogate() throws IOException {
+		document.set(new String(new char[] { '\uD800', 'A' })); // \uD800 is a high surrogate, followed by 'A'
+		try (var is = new DocumentInputStream(document)) {
+			assertEquals(UTF_8, is.getCharset());
+			final byte[] result = is.readAllBytes();
+			final String output = new String(result, UTF_8);
+
+			// the invalid surrogate pair should be replaced by the Unicode replacement char
+			assertEquals(CharsInputStream.UNICODE_REPLACEMENT_CHAR + "A", output);
+		}
+	}
+}
diff --git a/org.eclipse.lsp4e/src/org/eclipse/lsp4e/DocumentInputStream.java b/org.eclipse.lsp4e/src/org/eclipse/lsp4e/DocumentInputStream.java
index 61d434d3c..f1b856cbc 100644
--- a/org.eclipse.lsp4e/src/org/eclipse/lsp4e/DocumentInputStream.java
+++ b/org.eclipse.lsp4e/src/org/eclipse/lsp4e/DocumentInputStream.java
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (c) 2019 Red Hat Inc. and others.
+ * Copyright (c) 2024 Sebastian Thomschke and others.
  * This program and the accompanying materials are made
  * available under the terms of the Eclipse Public License 2.0
  * which is available at https://www.eclipse.org/legal/epl-2.0/
@@ -7,37 +7,17 @@
  * SPDX-License-Identifier: EPL-2.0
  *
  * Contributors:
- *  Mickael Istria (Red Hat Inc.) - initial implementation
+ * Sebastian Thomschke - initial implementation
  *******************************************************************************/
-
 package org.eclipse.lsp4e;
 
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.eclipse.jface.text.BadLocationException;
 import org.eclipse.jface.text.IDocument;
+import org.eclipse.lsp4e.internal.CharsInputStream;
+import org.eclipse.lsp4e.internal.DocumentUtil;
 
-final class DocumentInputStream extends InputStream {
-	private int index = 0;
-	private final IDocument document;
+public final class DocumentInputStream extends CharsInputStream {
 
-	DocumentInputStream(IDocument document) {
-		this.document = document;
+	public DocumentInputStream(final IDocument doc) {
+		super(doc::getChar, doc::getLength, DocumentUtil.getCharset(doc));
 	}
-
-	@Override
-	public int read() throws IOException {
-		if (index < document.getLength()) {
-			try {
-				char res = document.getChar(index);
-				index++;
-				return res;
-			} catch (BadLocationException e) {
-				throw new IOException(e);
-			}
-		}
-		return -1;
-	}
-
-}
\ No newline at end of file
+}
diff --git a/org.eclipse.lsp4e/src/org/eclipse/lsp4e/internal/CharsInputStream.java b/org.eclipse.lsp4e/src/org/eclipse/lsp4e/internal/CharsInputStream.java
new file mode 100644
index 000000000..eaad99a23
--- /dev/null
+++ b/org.eclipse.lsp4e/src/org/eclipse/lsp4e/internal/CharsInputStream.java
@@ -0,0 +1,295 @@
+/*******************************************************************************
+ * Copyright (c) 2024 Sebastian Thomschke and others.
+ * This program and the accompanying materials are made
+ * available under the terms of the Eclipse Public License 2.0
+ * which is available at https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ * Contributors:
+ * Sebastian Thomschke - initial implementation
+ *******************************************************************************/
+package org.eclipse.lsp4e.internal;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.util.Objects;
+import java.util.function.IntSupplier;
+
+public class CharsInputStream extends InputStream {
+
+	/**
+	 * Functional interface for supplying characters at a specified index.
+	 * Implementations can define how characters are fetched.
+	 */
+	@FunctionalInterface
+	public interface CharsSupplier {
+		char charAt(int index) throws Exception;
+	}
+
+	private enum EncoderState {
+		/**
+		 * The {@link #encoder} is actively encoding characters into bytes. This is the
+		 * initial state of the encoder.
+		 */
+		ENCODING, //
+
+		/**
+		 * The {@link #encoder} has finished processing all characters and is now
+		 * flushing any remaining bytes in its internal buffer.
+		 */
+		FLUSHING, //
+
+		/**
+		 * The {@link #encoder} has completed both the encoding and flushing processes.
+		 * No more data is left to be read from the encoder.
+		 */
+		DONE
+	}
+
+	public static final char UNICODE_REPLACEMENT_CHAR = '\uFFFD';
+
+	/** 512 surrogate character pairs */
+	private static final int DEFAULT_BUFFER_SIZE = 512;
+	private static final int EOF = -1;
+
+	private final int bufferSize;
+	private final CharBuffer charBuffer;
+	private final ByteBuffer byteBuffer;
+	private final CharsetEncoder encoder;
+	private EncoderState encoderState = EncoderState.ENCODING;
+
+	private int charIndex = 0;
+	private final CharsSupplier chars;
+	private final IntSupplier charsLength;
+
+	public CharsInputStream(final CharSequence chars) {
+		this(chars, Charset.defaultCharset());
+	}
+
+	public CharsInputStream(final CharSequence chars, final Charset charset) {
+		this(chars, charset, DEFAULT_BUFFER_SIZE);
+	}
+
+	public CharsInputStream(final CharSequence chars, final Charset charset, final int bufferSize) {
+		this(chars::charAt, chars::length, charset, bufferSize);
+	}
+
+	public CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength) {
+		this(chars, charsLength, Charset.defaultCharset());
+	}
+
+	/**
+	 * @param chars
+	 *            function to access indexed chars.
+	 * @param charsLength
+	 *            function to get the number of indexed chars provided by the
+	 *            <code>chars</code> parameter.
+	 */
+	public CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength, final Charset charset) {
+		this(chars, charsLength, charset, DEFAULT_BUFFER_SIZE);
+	}
+
+	/**
+	 * @param chars
+	 *            function to access indexed chars.
+	 * @param charsLength
+	 *            function to get the number of indexed chars provided by the
+	 *            <code>chars</code> parameter.
+	 * @param bufferSize
+	 *            number of surrogate character pairs to encode at once.
+	 */
+	public CharsInputStream(final CharsSupplier chars, final IntSupplier charsLength, final Charset charset,
+			final int bufferSize) {
+		if (bufferSize < 1)
+			throw new IllegalArgumentException("[bufferSize] must be 1 or larger"); //$NON-NLS-1$
+		encoder = charset.newEncoder();
+
+		this.bufferSize = bufferSize;
+		charBuffer = CharBuffer.allocate(bufferSize * 2); // bufferSize code points of up to 2 chars each (high/low surrogate)
+		byteBuffer = ByteBuffer.allocate(bufferSize * 4); // sized for up to 4 encoded bytes per code point
+		byteBuffer.flip(); // flipping the freshly allocated buffers leaves them empty in read mode
+		charBuffer.flip();
+
+		this.chars = chars;
+		this.charsLength = charsLength;
+	}
+
+	@Override
+	public int available() {
+		final int remaining = byteBuffer.remaining(); // already encoded bytes that have not been read yet
+		return remaining == 0 ? charsLength.getAsInt() - charIndex : remaining; // nothing encoded: count of unread chars (not bytes)
+	}
+
+	/**
+	 * This method is called by {@link #refillByteBuffer()} to encode characters
+	 * from the given {@link CharBuffer} into bytes and stores them in the
+	 * {@link #byteBuffer}.
+	 *
+	 * <p>
+	 * The method can be used either to encode characters in the middle of input
+	 * (with {@code isEndOfInput=false}) or to finalize the encoding process at the
+	 * end of input (with {@code isEndOfInput=true}).
+	 * </p>
+	 *
+	 * @param in
+	 *            the {@link CharBuffer} containing characters to encode.
+	 * @param isEndOfInput
+	 *            if {@code true}, signals that no more input will be provided,
+	 *            allowing the encoder to complete its final encoding steps.
+	 */
+	private void encodeChars(final CharBuffer in, final boolean isEndOfInput) throws CharacterCodingException {
+		byteBuffer.clear();
+		final CoderResult result = encoder.encode(in, byteBuffer, isEndOfInput);
+		byteBuffer.flip();
+		if (result.isError()) {
+			result.throwException();
+		}
+	}
+
+	/**
+	 * Flushes the remaining bytes from the encoder to the {@link #byteBuffer}.
+	 *
+	 * <p>
+	 * This method is called by {@link #refillByteBuffer()} when all characters have
+	 * been processed, and the encoder needs to output any remaining bytes. It
+	 * transitions the encoder state from {@link EncoderState#ENCODING} to
+	 * {@link EncoderState#FLUSHING}, and eventually to {@link EncoderState#DONE}
+	 * once all bytes have been flushed.
+	 * </p>
+	 *
+	 * @return {@code true} if there are still bytes left in the {@link #byteBuffer}
+	 *         after flushing, or if the encoder still has more bytes to flush;
+	 *         {@code false} if the flush is complete and no bytes remain.
+	 */
+	private boolean flushEncoder() throws IOException {
+		if (encoderState == EncoderState.DONE)
+			return false;
+
+		if (encoderState == EncoderState.ENCODING) {
+			encoderState = EncoderState.FLUSHING;
+		}
+
+		// flush
+		byteBuffer.clear();
+		final CoderResult result = encoder.flush(byteBuffer);
+		byteBuffer.flip();
+
+		if (result.isOverflow()) {
+			// the byteBuffer has been filled, but there are more bytes to be flushed.
+			// after reading all available bytes from byteBuffer, flushEncoder() needs to
+			// be called again to process the remaining data.
+			return true;
+		}
+
+		if (result.isError()) {
+			result.throwException();
+		}
+
+		encoderState = EncoderState.DONE;
+		return byteBuffer.hasRemaining();
+	}
+
+	public Charset getCharset() {
+		return encoder.charset();
+	}
+
+	@Override
+	public int read() throws IOException {
+		if (!byteBuffer.hasRemaining() && !refillByteBuffer())
+			return EOF;
+		return byteBuffer.get() & 0xFF; // next byte as an unsigned integer (0 to 255)
+	}
+
+	@Override
+	public int read(final byte[] buf, final int off, final int bytesToRead) throws IOException {
+		Objects.checkFromIndexSize(off, bytesToRead, buf.length);
+		if (bytesToRead == 0)
+			return 0;
+
+		int bytesRead = 0;
+		int bytesReadable = byteBuffer.remaining();
+
+		while (bytesRead < bytesToRead) {
+			if (bytesReadable == 0) {
+				if (refillByteBuffer()) {
+					bytesReadable = byteBuffer.remaining();
+				} else
+					return bytesRead == 0 ? EOF : bytesRead;
+			}
+
+			final int bytesToReadNow = Math.min(bytesToRead - bytesRead, bytesReadable);
+			byteBuffer.get(buf, off + bytesRead, bytesToReadNow);
+			bytesRead += bytesToReadNow;
+			bytesReadable -= bytesToReadNow;
+		}
+
+		return bytesRead;
+	}
+
+	/**
+	 * Refills the {@link #byteBuffer} by reading characters from the character
+	 * supplier, encoding them, and storing the resulting bytes into it. Isolated
+	 * high or low surrogates are replaced by {@link #UNICODE_REPLACEMENT_CHAR}.
+	 *
+	 * @return {@code true} if more bytes were encoded, {@code false} if the end
+	 *         of the stream is reached and there are no more bytes to read.
+	 * @throws IOException if reading or encoding the characters fails.
+	 */
+	private boolean refillByteBuffer() throws IOException {
+		if (encoderState == EncoderState.DONE)
+			return false;
+
+		if (encoderState == EncoderState.FLUSHING)
+			return flushEncoder();
+
+		final int charsLen = charsLength.getAsInt();
+
+		// charBuffer is in read mode between calls: chars still remaining in it did
+		// not fit into byteBuffer during the previous pass and are encoded first
+		if (charIndex >= charsLen && !charBuffer.hasRemaining()) {
+			// EOF reached: finalize encoding before switching to flushing
+			encodeChars(CharBuffer.allocate(0), true /* signal EOF */);
+			return flushEncoder();
+		}
+
+		try {
+			charBuffer.compact(); // keep pending chars and switch to write mode
+			for (int i = 0; i < bufferSize && charIndex < charsLen && charBuffer.remaining() > 1; i++) {
+				final char nextChar = chars.charAt(charIndex++);
+				if (!Character.isSurrogate(nextChar)) {
+					charBuffer.put(nextChar);
+					continue;
+				}
+				// a surrogate is only valid as a high surrogate directly followed by a
+				// low surrogate; the encoder would reject anything else as malformed
+				if (Character.isHighSurrogate(nextChar) && charIndex < charsLen) {
+					final char lowSurrogate = chars.charAt(charIndex);
+					if (Character.isLowSurrogate(lowSurrogate)) {
+						charIndex++;
+						charBuffer.put(nextChar);
+						charBuffer.put(lowSurrogate);
+						continue;
+					}
+				}
+				// isolated high or low surrogate - fallback to replacement character
+				charBuffer.put(UNICODE_REPLACEMENT_CHAR);
+			}
+			charBuffer.flip();
+
+			// encode chars into bytes
+			encodeChars(charBuffer, false);
+		} catch (final Exception ex) {
+			throw new IOException(ex);
+		}
+		if (!byteBuffer.hasRemaining() && charBuffer.hasRemaining())
+			throw new IOException("[bufferSize] is too small to encode the next character"); //$NON-NLS-1$
+		return true;
+	}
+}
diff --git a/org.eclipse.lsp4e/src/org/eclipse/lsp4e/internal/DocumentUtil.java b/org.eclipse.lsp4e/src/org/eclipse/lsp4e/internal/DocumentUtil.java
index bc7a34b24..4789ff0bf 100644
--- a/org.eclipse.lsp4e/src/org/eclipse/lsp4e/internal/DocumentUtil.java
+++ b/org.eclipse.lsp4e/src/org/eclipse/lsp4e/internal/DocumentUtil.java
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (c) 2023 Avaloq Group AG.
+ * Copyright (c) 2023, 2024 Avaloq Group AG and others.
  * This program and the accompanying materials are made
  * available under the terms of the Eclipse Public License 2.0
  * which is available at https://www.eclipse.org/legal/epl-2.0/
@@ -8,14 +8,19 @@
  *
  * Contributors:
  *  Rubén Porras Campo (Avaloq Group AG) - Initial Implementation
+ *  Sebastian Thomschke - add getCharset method
  *******************************************************************************/
 package org.eclipse.lsp4e.internal;
 
+import java.nio.charset.Charset;
+
+import org.eclipse.core.filebuffers.ITextFileBuffer;
 import org.eclipse.core.resources.IFile;
 import org.eclipse.jdt.annotation.Nullable;
 import org.eclipse.jface.text.IDocument;
 import org.eclipse.jface.text.IDocumentExtension4;
 import org.eclipse.lsp4e.LSPEclipseUtils;
+import org.eclipse.lsp4e.LanguageServerPlugin;
 
 public final class DocumentUtil {
 
@@ -45,4 +50,17 @@ public static long getDocumentModificationStamp(@Nullable IDocument document) {
 		return IDocumentExtension4.UNKNOWN_MODIFICATION_STAMP;
 	}
 
+	public static Charset getCharset(final IDocument document) {
+		final ITextFileBuffer buffer = LSPEclipseUtils.toBuffer(document);
+		if (buffer == null)
+			return Charset.defaultCharset(); // no text file buffer was found for the document
+		try {
+			final String charsetName = buffer.getEncoding();
+			if (charsetName != null)
+				return Charset.forName(charsetName);
+		} catch (final Exception ex) { // e.g. Charset.forName rejecting the reported name
+		   LanguageServerPlugin.logError(ex);
+		}
+		return Charset.defaultCharset(); // encoding unknown or not resolvable: use the JVM default
+	}
 }

From 14d95d4badaa04f2ade3d94eec21610f54bc689a Mon Sep 17 00:00:00 2001
From: Sebastian Thomschke <sebthom@users.noreply.github.com>
Date: Tue, 20 Aug 2024 15:35:06 +0200
Subject: [PATCH 2/2] refact: move DocumentInputStream to internal package

---
 .../eclipse/lsp4e/test/internal/DocumentInputStreamTest.java  | 2 +-
 org.eclipse.lsp4e/src/org/eclipse/lsp4e/LSPEclipseUtils.java  | 1 +
 .../org/eclipse/lsp4e/{ => internal}/DocumentInputStream.java | 4 +---
 3 files changed, 3 insertions(+), 4 deletions(-)
 rename org.eclipse.lsp4e/src/org/eclipse/lsp4e/{ => internal}/DocumentInputStream.java (85%)

diff --git a/org.eclipse.lsp4e.test/src/org/eclipse/lsp4e/test/internal/DocumentInputStreamTest.java b/org.eclipse.lsp4e.test/src/org/eclipse/lsp4e/test/internal/DocumentInputStreamTest.java
index dc1b35854..04f8ef7e9 100644
--- a/org.eclipse.lsp4e.test/src/org/eclipse/lsp4e/test/internal/DocumentInputStreamTest.java
+++ b/org.eclipse.lsp4e.test/src/org/eclipse/lsp4e/test/internal/DocumentInputStreamTest.java
@@ -19,9 +19,9 @@
 
 import org.eclipse.core.runtime.CoreException;
 import org.eclipse.jface.text.IDocument;
-import org.eclipse.lsp4e.DocumentInputStream;
 import org.eclipse.lsp4e.LSPEclipseUtils;
 import org.eclipse.lsp4e.internal.CharsInputStream;
+import org.eclipse.lsp4e.internal.DocumentInputStream;
 import org.eclipse.lsp4e.test.utils.AbstractTestWithProject;
 import org.eclipse.lsp4e.test.utils.TestUtils;
 import org.junit.Before;
diff --git a/org.eclipse.lsp4e/src/org/eclipse/lsp4e/LSPEclipseUtils.java b/org.eclipse.lsp4e/src/org/eclipse/lsp4e/LSPEclipseUtils.java
index 277080281..85d606316 100644
--- a/org.eclipse.lsp4e/src/org/eclipse/lsp4e/LSPEclipseUtils.java
+++ b/org.eclipse.lsp4e/src/org/eclipse/lsp4e/LSPEclipseUtils.java
@@ -90,6 +90,7 @@
 import org.eclipse.jface.text.TextSelection;
 import org.eclipse.jface.viewers.ISelection;
 import org.eclipse.jface.viewers.ISelectionProvider;
+import org.eclipse.lsp4e.internal.DocumentInputStream;
 import org.eclipse.lsp4e.refactoring.CreateFileChange;
 import org.eclipse.lsp4e.refactoring.DeleteExternalFile;
 import org.eclipse.lsp4e.refactoring.LSPTextChange;
diff --git a/org.eclipse.lsp4e/src/org/eclipse/lsp4e/DocumentInputStream.java b/org.eclipse.lsp4e/src/org/eclipse/lsp4e/internal/DocumentInputStream.java
similarity index 85%
rename from org.eclipse.lsp4e/src/org/eclipse/lsp4e/DocumentInputStream.java
rename to org.eclipse.lsp4e/src/org/eclipse/lsp4e/internal/DocumentInputStream.java
index f1b856cbc..b55d32b37 100644
--- a/org.eclipse.lsp4e/src/org/eclipse/lsp4e/DocumentInputStream.java
+++ b/org.eclipse.lsp4e/src/org/eclipse/lsp4e/internal/DocumentInputStream.java
@@ -9,11 +9,9 @@
  * Contributors:
  * Sebastian Thomschke - initial implementation
  *******************************************************************************/
-package org.eclipse.lsp4e;
+package org.eclipse.lsp4e.internal;
 
 import org.eclipse.jface.text.IDocument;
-import org.eclipse.lsp4e.internal.CharsInputStream;
-import org.eclipse.lsp4e.internal.DocumentUtil;
 
 public final class DocumentInputStream extends CharsInputStream {