diff --git a/src/main/java/edu/princeton/cs/algs4/LempelZivWelch.java b/src/main/java/edu/princeton/cs/algs4/LempelZivWelch.java new file mode 100644 index 0000000..ae74ee9 --- /dev/null +++ b/src/main/java/edu/princeton/cs/algs4/LempelZivWelch.java @@ -0,0 +1,156 @@ +/****************************************************************************** + * Compilation: javac LempelZivWelch.java + * Execution: java LempelZivWelch - < input.txt (compress) + * Execution: java LempelZivWelch + < input.txt (expand) + * Dependencies: BinaryIn.java BinaryOut.java + * Data files: https://algs4.cs.princeton.edu/55compression/abraLZW.txt + * https://algs4.cs.princeton.edu/55compression/ababLZW.txt + * + * Compress or expand binary input from standard input using LZW. + * + * + ******************************************************************************/ + +package edu.princeton.cs.algs4; + +import edu.princeton.cs.algs4.TernarySearchTrie; +import edu.princeton.cs.algs4.BinaryStdIn; +import edu.princeton.cs.algs4.BinaryStdOut; + +/** + * The {@code LempelZivWelch} class provides static methods for compressing and expanding a binary + * input using LempelZivWelch compression over the 8-bit extended ASCII alphabet with 12-bit + * codewords. + * + *

Starting with Oracle Java 7u6, the substring method takes time and space linear in the length
+ * of the extracted substring (instead of constant time and space as in earlier versions). As a
+ * result, compression takes quadratic time in the original {@code LZW} class. See this article for more
+ * details.
+ *
+ *

This class, along with {@code TernarySearchTrie} - the enhanced version of {@code TST}, fixes
+ * the above issue. The key to the fix is to use the new method {@code
+ * TernarySearchTrie.longestPrefixOf(String query, int startIndex)} in {@code compress()}.
+ *
+ *

For additional documentation, see Section 5.5 of Algorithms, 4th
+ * Edition by Robert Sedgewick and Kevin Wayne.
+ *
+ * @author Robert Sedgewick
+ * @author Kevin Wayne
+ */
+public class LempelZivWelch {
+  private static final int R = 256; // number of input chars; R itself is the EOF codeword
+  private static final int L = 4096; // number of codewords = 2^W
+  private static final int W = 12; // codeword width
+
+  // Do not instantiate.
+  private LempelZivWelch() {}
+
+  /**
+   * Reads a sequence of 8-bit bytes from standard input; compresses them using LempelZivWelch
+   * compression with 12-bit codewords; and writes the results to standard output.
+   *
+   * <p>The input is scanned in place through {@code index}; it is never shortened, so every
+   * position test below uses absolute offsets into {@code input}.
+   */
+  public static void compress() {
+    String input = BinaryStdIn.readString();
+    TernarySearchTrie<Integer> st = new TernarySearchTrie<Integer>();
+
+    // since TernarySearchTrie is not balanced, it would be better to insert in a different order
+    for (int i = 0; i < R; i++) {
+      st.put("" + (char) i, i);
+    }
+
+    int code = R + 1; // R is codeword for EOF
+    int index = 0;
+    while (index < input.length()) {
+      String s = st.longestPrefixOf(input, index); // Find max prefix match s.
+      BinaryStdOut.write(st.get(s), W); // Print s's encoding.
+      int t = s.length();
+      // The lookahead character lives at index + t. Comparing t alone against the full input
+      // length would let substring() run past the end of input on the final match.
+      if (index + t < input.length() && code < L) {
+        // Add s plus its lookahead character to the symbol table.
+        st.put(input.substring(index, index + t + 1), code++);
+      }
+      index += t; // Scan past s in input.
+    }
+    BinaryStdOut.write(R, W);
+    BinaryStdOut.close();
+  }
+
+  /**
+   * Reads a sequence of bits encoded using LempelZivWelch compression with 12-bit codewords from
+   * standard input; expands them; and writes the results to standard output.
+   */
+  public static void expand() {
+    String[] st = new String[L];
+    int i; // next available codeword value
+
+    // initialize symbol table with all 1-character strings
+    for (i = 0; i < R; i++) {
+      st[i] = "" + (char) i;
+    }
+    st[i++] = ""; // (unused) lookahead for EOF
+
+    int codeword = BinaryStdIn.readInt(W);
+    if (codeword == R) {
+      return; // expanded message is empty string
+    }
+    String val = st[codeword];
+
+    while (true) {
+      BinaryStdOut.write(val);
+      codeword = BinaryStdIn.readInt(W);
+      if (codeword == R) {
+        break;
+      }
+      String s = st[codeword];
+      if (i == codeword) { // special case hack
+        // The codeword refers to the entry about to be added: it is val plus val's first char.
+        s = val + val.charAt(0);
+      }
+      if (i < L) {
+        st[i++] = val + s.charAt(0);
+      }
+      val = s;
+    }
+    BinaryStdOut.close();
+  }
+
+  /**
+   * Sample client that calls {@code compress()} if the command-line argument is "-" and {@code
+   * expand()} if it is "+".
+   *
+   * @param args the command-line arguments
+   * @throws IllegalArgumentException if the first argument is missing or is neither "-" nor "+"
+   */
+  public static void main(String[] args) {
+    if (args.length == 0) {
+      throw new IllegalArgumentException("Illegal command line argument");
+    }
+    if (args[0].equals("-")) {
+      compress();
+    } else if (args[0].equals("+")) {
+      expand();
+    } else {
+      throw new IllegalArgumentException("Illegal command line argument");
+    }
+  }
+}
+
+/******************************************************************************
+ * Copyright 2002-2020, Robert Sedgewick and Kevin Wayne.
+ *
+ * This file is part of algs4.jar, which accompanies the textbook
+ *
+ * Algorithms, 4th edition by Robert Sedgewick and Kevin Wayne,
+ * Addison-Wesley Professional, 2011, ISBN 0-321-57351-X.
+ * http://algs4.cs.princeton.edu
+ *
+ *
+ * algs4.jar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * algs4.jar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with algs4.jar. If not, see http://www.gnu.org/licenses. + ******************************************************************************/ diff --git a/src/main/java/edu/princeton/cs/algs4/TernarySearchTrie.java b/src/main/java/edu/princeton/cs/algs4/TernarySearchTrie.java new file mode 100644 index 0000000..d56fb5d --- /dev/null +++ b/src/main/java/edu/princeton/cs/algs4/TernarySearchTrie.java @@ -0,0 +1,381 @@ +/****************************************************************************** + * Compilation: javac TernarySearchTrie.java + * Execution: java TernarySearchTrie < words.txt + * Dependencies: StdIn.java + * Data files: https://algs4.cs.princeton.edu/52trie/shellsST.txt + * + * Symbol table with string keys, implemented using a ternary search + * trie (TernarySearchTrie). + * + * + * % java TernarySearchTrie < shellsST.txt + * keys(""): + * by 4 + * sea 6 + * sells 1 + * she 0 + * shells 3 + * shore 7 + * the 5 + * + * longestPrefixOf("shellsort"): + * shells + * + * keysWithPrefix("shor"): + * shore + * + * keysThatMatch(".he.l."): + * shells + * + * % java TernarySearchTrie + * theory the now is the time for all good men + * + * Remarks + * -------- + * - can't use a key that is the empty string "" + * + ******************************************************************************/ + +package edu.princeton.cs.algs4; + +import edu.princeton.cs.algs4.Queue; +import edu.princeton.cs.algs4.StdIn; +import edu.princeton.cs.algs4.StdOut; + +/** + * The {@code TernarySearchTrie} class represents an symbol table of key-value pairs, with string + * keys and generic values. It supports the usual put, get, contains, + * delete, size, and is-empty methods. 
It also provides character-based
+ * methods for finding the string in the symbol table that is the longest prefix of a given
+ * string, finding all strings in the symbol table that start with a given prefix, and
+ * finding all strings in the symbol table that match a given pattern. A symbol table
+ * implements the associative array abstraction: when associating a value with a key that
+ * is already in the symbol table, the convention is to replace the old value with the new value.
+ * Unlike {@link java.util.Map}, this class uses the convention that values cannot be {@code
+ * null}—setting the value associated with a key to {@code null} is equivalent to deleting the key
+ * from the symbol table.
+ *
+ *

This implementation uses a ternary search trie. + * + *

For additional documentation, see Section
+ * 5.2 of Algorithms, 4th Edition by Robert Sedgewick and Kevin Wayne.
+ *
+ * @param <T> the type of the values stored in this symbol table
+ */
+public class TernarySearchTrie<T> {
+  private int size; // number of key-value pairs
+  private Node<T> root; // root of TernarySearchTrie
+
+  // One trie node: a character, three links, and the value of the key that ends here (if any).
+  private static class Node<T> {
+    private char ch; // character
+    private Node<T> left; // keys whose character at this position is smaller than ch
+    private Node<T> mid; // keys that match ch and continue with further characters
+    private Node<T> right; // keys whose character at this position is larger than ch
+    private T val; // value associated with string
+  }
+
+  /** Initializes an empty string symbol table. */
+  public TernarySearchTrie() {}
+
+  /**
+   * Returns the number of key-value pairs in this symbol table.
+   *
+   * @return the number of key-value pairs in this symbol table
+   */
+  public int size() {
+    return size;
+  }
+
+  /**
+   * Checks whether this symbol table contains the given key.
+   *
+   * @param key the key
+   * @return {@code true} if this symbol table contains {@code key} and {@code false} otherwise
+   * @throws IllegalArgumentException if {@code key} is {@code null} or empty
+   */
+  public boolean contains(String key) {
+    if (key == null) {
+      throw new IllegalArgumentException("argument to contains() is null");
+    }
+    return get(key) != null;
+  }
+
+  /**
+   * Returns the value associated with the given key.
+   *
+   * @param key the key
+   * @return the value associated with the given key if the key is in the symbol table and {@code
+   *     null} if the key is not in the symbol table
+   * @throws IllegalArgumentException if {@code key} is {@code null} or empty
+   */
+  public T get(String key) {
+    if (key == null) {
+      throw new IllegalArgumentException("calls get() with null argument");
+    }
+    if (key.length() == 0) {
+      throw new IllegalArgumentException("key must have length >= 1");
+    }
+    Node<T> x = get(root, key, 0);
+    if (x == null) {
+      return null;
+    }
+    return x.val;
+  }
+
+  // return subtrie corresponding to given key (the node of its last character), or null if the
+  // path for key does not exist; d is the index of the character being matched
+  private Node<T> get(Node<T> x, String key, int d) {
+    if (x == null) {
+      return null;
+    }
+    if (key.length() == 0) {
+      throw new IllegalArgumentException("key must have length >= 1");
+    }
+    char c = key.charAt(d);
+    if (c < x.ch) {
+      return get(x.left, key, d);
+    } else if (c > x.ch) {
+      return get(x.right, key, d);
+    } else if (d < key.length() - 1) {
+      return get(x.mid, key, d + 1);
+    } else {
+      return x;
+    }
+  }
+
+  /**
+   * Inserts the key-value pair into the symbol table, overwriting the old value with the new value
+   * if the key is already in the symbol table. If the value is {@code null}, this effectively
+   * deletes the key from the symbol table; deleting a key that is not present does nothing.
+   *
+   * @param key the key
+   * @param val the value
+   * @throws IllegalArgumentException if {@code key} is {@code null} or empty
+   */
+  public void put(String key, T val) {
+    if (key == null) {
+      throw new IllegalArgumentException("calls put() with null key");
+    }
+    boolean present = contains(key);
+    if (!present && val == null) {
+      // Deleting an absent key: leave size untouched and do not grow the trie.
+      return;
+    }
+    if (!present) {
+      size++;
+    } else if (val == null) { // delete existing key
+      size--;
+    }
+    root = put(root, key, val, 0);
+  }
+
+  // inserts key into the subtrie rooted at x, creating nodes as needed; returns the subtrie root
+  private Node<T> put(Node<T> x, String key, T val, int d) {
+    char c = key.charAt(d);
+    if (x == null) {
+      x = new Node<T>();
+      x.ch = c;
+    }
+    if (c < x.ch) {
+      x.left = put(x.left, key, val, d);
+    } else if (c > x.ch) {
+      x.right = put(x.right, key, val, d);
+    } else if (d < key.length() - 1) {
+      x.mid = put(x.mid, key, val, d + 1);
+    } else {
+      x.val = val;
+    }
+    return x;
+  }
+
+  /**
+   * Returns the string in the symbol table that is the longest prefix of {@code query}, or the
+   * empty string if no such string.
+   *
+   * @param query the query string
+   * @return the string in the symbol table that is the longest prefix of {@code query}, or the
+   *     empty string if no such string
+   * @throws IllegalArgumentException if {@code query} is {@code null} or empty
+   */
+  public String longestPrefixOf(String query) {
+    return longestPrefixOf(query, 0);
+  }
+
+  /**
+   * Returns the string in the symbol table that is the longest prefix of the part of {@code query}
+   * that begins at {@code startIndex}, or the empty string if no such string. Only the matched
+   * part is copied out of {@code query}.
+   *
+   * @param query the query string
+   * @param startIndex the start index in query string
+   * @return the string in the symbol table that is the longest prefix of {@code query} starting
+   *     at {@code startIndex}, or the empty string if no such string
+   * @throws IllegalArgumentException if {@code query} is {@code null} or {@code startIndex} is
+   *     not a valid index into {@code query}
+   */
+  public String longestPrefixOf(String query, int startIndex) {
+    if (query == null || startIndex < 0 || startIndex >= query.length()) {
+      throw new IllegalArgumentException("calls longestPrefixOf() with wrong arguments");
+    }
+    int length = 0; // length of the longest key matched so far
+    Node<T> x = root;
+    int i = 0; // number of query characters matched along the current path
+    while (x != null && i + startIndex < query.length()) {
+      char c = query.charAt(i + startIndex);
+      if (c < x.ch) {
+        x = x.left;
+      } else if (c > x.ch) {
+        x = x.right;
+      } else {
+        i++;
+        if (x.val != null) {
+          length = i;
+        }
+        x = x.mid;
+      }
+    }
+    return query.substring(startIndex, startIndex + length);
+  }
+
+  /**
+   * Returns all keys in the symbol table as an {@code Iterable}. To iterate over all of the keys in
+   * the symbol table named {@code st}, use the foreach notation: {@code for (String key :
+   * st.keys())}.
+   *
+   * @return all keys in the symbol table as an {@code Iterable}
+   */
+  public Iterable<String> keys() {
+    Queue<String> queue = new Queue<String>();
+    collect(root, new StringBuilder(), queue);
+    return queue;
+  }
+
+  /**
+   * Returns all of the keys in the set that start with {@code prefix}. An empty prefix matches
+   * every key.
+   *
+   * @param prefix the prefix
+   * @return all of the keys in the set that start with {@code prefix}, as an iterable
+   * @throws IllegalArgumentException if {@code prefix} is {@code null}
+   */
+  public Iterable<String> keysWithPrefix(String prefix) {
+    if (prefix == null) {
+      throw new IllegalArgumentException("calls keysWithPrefix() with null argument");
+    }
+    if (prefix.length() == 0) {
+      // The private get() rejects empty keys, so handle the match-everything case here.
+      return keys();
+    }
+    Queue<String> queue = new Queue<String>();
+    Node<T> x = get(root, prefix, 0);
+    if (x == null) {
+      return queue;
+    }
+    if (x.val != null) {
+      queue.enqueue(prefix);
+    }
+    collect(x.mid, new StringBuilder(prefix), queue);
+    return queue;
+  }
+
+  // all keys in subtrie rooted at x with given prefix
+  private void collect(Node<T> x, StringBuilder prefix, Queue<String> queue) {
+    if (x == null) {
+      return;
+    }
+    collect(x.left, prefix, queue);
+    if (x.val != null) {
+      queue.enqueue(prefix.toString() + x.ch);
+    }
+    // Extend the shared builder for the middle subtrie, then undo the extension.
+    collect(x.mid, prefix.append(x.ch), queue);
+    prefix.deleteCharAt(prefix.length() - 1);
+    collect(x.right, prefix, queue);
+  }
+
+  // all keys in subtrie rooted at x that match pattern from index i on, given the matched prefix
+  private void collect(
+      Node<T> x, StringBuilder prefix, int i, String pattern, Queue<String> queue) {
+    if (x == null) {
+      return;
+    }
+    char c = pattern.charAt(i);
+    if (c == '.' || c < x.ch) {
+      collect(x.left, prefix, i, pattern, queue);
+    }
+    if (c == '.' || c == x.ch) {
+      if (i == pattern.length() - 1 && x.val != null) {
+        queue.enqueue(prefix.toString() + x.ch);
+      }
+      if (i < pattern.length() - 1) {
+        collect(x.mid, prefix.append(x.ch), i + 1, pattern, queue);
+        prefix.deleteCharAt(prefix.length() - 1);
+      }
+    }
+    if (c == '.' || c > x.ch) {
+      collect(x.right, prefix, i, pattern, queue);
+    }
+  }
+
+  /**
+   * Returns all of the keys in the symbol table that match {@code pattern}, where the character '.'
+   * is interpreted as a wildcard character. Only keys with the same length as {@code pattern} can
+   * match, so an empty pattern matches nothing.
+   *
+   * @param pattern the pattern
+   * @return all of the keys in the symbol table that match {@code pattern}, as an iterable, where .
+   *     is treated as a wildcard character.
+   * @throws IllegalArgumentException if {@code pattern} is {@code null}
+   */
+  public Iterable<String> keysThatMatch(String pattern) {
+    if (pattern == null) {
+      throw new IllegalArgumentException("calls keysThatMatch() with null argument");
+    }
+    Queue<String> queue = new Queue<String>();
+    if (pattern.length() == 0) {
+      return queue; // put() rejects empty keys, so no key can match an empty pattern
+    }
+    collect(root, new StringBuilder(), 0, pattern, queue);
+    return queue;
+  }
+
+  /**
+   * Unit tests the {@code TernarySearchTrie} data type.
+   *
+   * @param args the command-line arguments
+   */
+  public static void main(String[] args) {
+
+    // build symbol table from standard input
+    TernarySearchTrie<Integer> st = new TernarySearchTrie<Integer>();
+    for (int i = 0; !StdIn.isEmpty(); i++) {
+      String key = StdIn.readString();
+      st.put(key, i);
+    }
+
+    // print results
+    if (st.size() < 100) {
+      StdOut.println("keys(\"\"):");
+      for (String key : st.keys()) {
+        StdOut.println(key + " " + st.get(key));
+      }
+      StdOut.println();
+    }
+
+    StdOut.println("longestPrefixOf(\"shellsort\"):");
+    StdOut.println(st.longestPrefixOf("shellsort"));
+    StdOut.println();
+
+    StdOut.println("longestPrefixOf(\"shell\"):");
+    StdOut.println(st.longestPrefixOf("shell"));
+    StdOut.println();
+
+    StdOut.println("keysWithPrefix(\"shor\"):");
+    for (String s : st.keysWithPrefix("shor")) {
+      StdOut.println(s);
+    }
+    StdOut.println();
+
+    StdOut.println("keysThatMatch(\".he.l.\"):");
+    for (String s : st.keysThatMatch(".he.l.")) {
+      StdOut.println(s);
+    }
+  }
+}
+
+/******************************************************************************
+ * Copyright 2002-2020, Robert Sedgewick and Kevin Wayne.
+ *
+ * This file is part of algs4.jar, which accompanies the textbook
+ *
+ * Algorithms, 4th edition by Robert Sedgewick and Kevin Wayne,
+ * Addison-Wesley Professional, 2011, ISBN 0-321-57351-X.
+ * http://algs4.cs.princeton.edu
+ *
+ *
+ * algs4.jar is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * algs4.jar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with algs4.jar. If not, see http://www.gnu.org/licenses.
+ ******************************************************************************/
diff --git a/tools/bwt_compress.sh b/tools/bwt_compress.sh
index 64ca37c..4e2b78b 100755
--- a/tools/bwt_compress.sh
+++ b/tools/bwt_compress.sh
@@ -1,4 +1,4 @@
-# !/bin/bash
+#!/bin/bash
 cd `dirname $0`/..
 
 FILE_NAME=$1
diff --git a/tools/huf_compress.sh b/tools/huf_compress.sh
index 8a63310..f099fd4 100755
--- a/tools/huf_compress.sh
+++ b/tools/huf_compress.sh
@@ -1,4 +1,4 @@
-# !/bin/bash
+#!/bin/bash
 cd `dirname $0`/..
 
 FILE_NAME=$1
diff --git a/tools/lzw.sh b/tools/lzw.sh
new file mode 100755
index 0000000..3e339cc
--- /dev/null
+++ b/tools/lzw.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+cd "$(dirname "$0")/.." || exit 1
+# Usage: tools/lzw.sh FILE_NAME [CLASS_NAME]; paths resolve from the parent of this script's dir.
+FILE_NAME=$1
+CLASS_NAME=$2
+
+mysize=$(wc -c < "$FILE_NAME" | tr -d ' ')
+echo "${FILE_NAME} size: ${mysize} bytes"
+
+start_ms=$(ruby -e 'puts (Time.now.to_f * 1000).to_i')
+
+if [ -z "$CLASS_NAME" ]
+then
+  CLASS_NAME=LempelZivWelch
+fi
+
+tools/run.sh "edu.princeton.cs.algs4.$CLASS_NAME" - < "$FILE_NAME" | \
+  tools/run.sh edu.princeton.cs.algs4.HexDump 64 | tail -1
+
+end_ms=$(ruby -e 'puts (Time.now.to_f * 1000).to_i')
+elapsed_ms=$((end_ms - start_ms))
+echo "$elapsed_ms ms used"
+
+# based on improved LZW ==> LempelZivWelch
+# tools/lzw.sh src/test/data/burrows/dickens_512K.txt
+# src/test/data/burrows/dickens_512K.txt size: 512000 bytes
+# 2018344 bits
+# 1825 ms used
+
+# based on original LZW of algs4 book
+# tools/lzw.sh src/test/data/burrows/dickens_512K.txt LZW
+# src/test/data/burrows/dickens_512K.txt size: 512000 bytes
+# 2018344 bits
+# 4723 ms used
+
+# irb(main):001:0> 2018344.0/512000
+# => 3.942078125
+
+# based on improved LZW ==> LempelZivWelch
+# algs4 % tools/lzw.sh src/test/data/burrows/dickens.txt
+# src/test/data/burrows/dickens.txt size: 28965453 bytes
+# 118099584 bits
+# 39963 ms used
+
+# irb(main):002:0> 118099584.0/28965453
+# => 4.077256585629785
diff --git a/tools/lzw_compress.sh b/tools/lzw_compress.sh
index 4a04156..ab90f5f 100755
--- a/tools/lzw_compress.sh
+++ b/tools/lzw_compress.sh
@@ -1,4 +1,4 @@
-# !/bin/bash
+#!/bin/bash
 cd `dirname $0`/..
 
 FILE_NAME=$1