Skip to content

Commit ce6f731

Browse files
authored
feat: Add RLE and Shannon-Fano compression algorithms (#6779)
* feat: Add RLE and Shannon-Fano compression algorithms * Fix: Resolve CI failures for compression algorithms * chore: trigger CI rebuild
1 parent b50d1d0 commit ce6f731

File tree

4 files changed

+407
-0
lines changed

4 files changed

+407
-0
lines changed
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
package com.thealgorithms.compression;
2+
3+
/**
 * An implementation of the Run-Length Encoding (RLE) algorithm.
 *
 * <p>Run-Length Encoding is a simple form of lossless data compression in which
 * runs of data (sequences in which the same data value occurs in many
 * consecutive data elements) are stored as a single data value and count,
 * rather than as the original run.
 *
 * <p>This implementation provides methods for both compressing and decompressing
 * a string. For example:
 * <ul>
 * <li>Compressing "AAAABBBCCDAA" results in "4A3B2C1D2A".</li>
 * <li>Decompressing "4A3B2C1D2A" results in "AAAABBBCCDAA".</li>
 * </ul>
 *
 * <p>Format limitation: each run is written as its decimal count (ASCII digits
 * {@code 0}-{@code 9}) immediately followed by the repeated character. Text that
 * itself contains ASCII digits therefore cannot be round-tripped, because the
 * decoder cannot tell a data digit from a count digit (for example,
 * {@code compress("111")} yields {@code "31"}).
 *
 * <p>Time Complexity: O(n) for both compression and decompression, where n is the
 * length of the input string (decompression is additionally linear in the size of
 * the produced output).
 *
 * <p>References:
 * <ul>
 * <li><a href="https://en.wikipedia.org/wiki/Run-length_encoding">Wikipedia: Run-length encoding</a></li>
 * </ul>
 */
public final class RunLengthEncoding {

    /**
     * Private constructor to prevent instantiation of this utility class.
     */
    private RunLengthEncoding() {
    }

    /**
     * Compresses a string using the Run-Length Encoding algorithm.
     *
     * @param text The string to be compressed. May be null.
     * @return The compressed string. Returns an empty string if the input is null or empty.
     */
    public static String compress(String text) {
        if (text == null || text.isEmpty()) {
            return "";
        }

        StringBuilder compressed = new StringBuilder();
        int count = 1;
        int lastIndex = text.length() - 1;

        for (int i = 0; i <= lastIndex; i++) {
            char current = text.charAt(i);
            // A run ends at the last character or when the next character differs.
            boolean runEnds = i == lastIndex || current != text.charAt(i + 1);
            if (runEnds) {
                compressed.append(count).append(current);
                count = 1; // Start counting the next run
            } else {
                count++;
            }
        }
        return compressed.toString();
    }

    /**
     * Decompresses a string that was compressed using the Run-Length Encoding algorithm.
     *
     * <p>Only the ASCII digits {@code 0}-{@code 9} are interpreted as part of a run
     * count, matching what {@link #compress(String)} emits. Any other character,
     * including non-ASCII Unicode digits, is treated as the symbol of a run. A symbol
     * that is not preceded by a count is emitted zero times, and digits that are not
     * followed by a symbol are ignored.
     *
     * @param compressedText The compressed string. May be null.
     * @return The original, uncompressed string. Returns an empty string if the input
     *         is null or empty.
     */
    public static String decompress(String compressedText) {
        if (compressedText == null || compressedText.isEmpty()) {
            return "";
        }

        StringBuilder decompressed = new StringBuilder();
        int count = 0;

        for (int i = 0; i < compressedText.length(); i++) {
            char ch = compressedText.charAt(i);
            if (isAsciiDigit(ch)) {
                // Build the number for runs of 10 or more (e.g., "12A")
                count = count * 10 + (ch - '0');
            } else {
                // The count is accumulated in an int and may wrap negative on overflow;
                // Math.max keeps such a value from reaching String.repeat, which rejects
                // negative counts.
                decompressed.append(String.valueOf(ch).repeat(Math.max(0, count)));
                count = 0; // Reset count for the next sequence
            }
        }
        return decompressed.toString();
    }

    /**
     * Tells whether the character is one of the ASCII digits '0' through '9'.
     *
     * <p>Character.isDigit is deliberately not used here: it also accepts non-ASCII
     * decimal digits, for which the {@code ch - '0'} arithmetic is meaningless and
     * which the compressor never emits as counts.
     */
    private static boolean isAsciiDigit(char ch) {
        return ch >= '0' && ch <= '9';
    }
}
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
package com.thealgorithms.compression;
2+
3+
import java.util.ArrayList;
4+
import java.util.Collections;
5+
import java.util.HashMap;
6+
import java.util.List;
7+
import java.util.Map;
8+
import java.util.stream.Collectors;
9+
10+
/**
11+
* An implementation of the Shannon-Fano algorithm for generating prefix codes.
12+
*
13+
* <p>Shannon-Fano coding is an entropy encoding technique for lossless data
14+
* compression. It assigns variable-length codes to symbols based on their
15+
* frequencies of occurrence. It is a precursor to Huffman coding and works by
16+
* recursively partitioning a sorted list of symbols into two sub-lists with
17+
* nearly equal total frequencies.
18+
*
19+
* <p>The algorithm works as follows:
20+
* <ol>
21+
* <li>Count the frequency of each symbol in the input data.</li>
22+
* <li>Sort the symbols in descending order of their frequencies.</li>
23+
* <li>Recursively divide the list of symbols into two parts with sums of
24+
* frequencies as close as possible to each other.</li>
25+
* <li>Assign a '0' bit to the codes in the first part and a '1' bit to the codes
26+
* in the second part.</li>
27+
* <li>Repeat the process for each part until a part contains only one symbol.</li>
28+
* </ol>
29+
*
30+
* <p>Time Complexity: O(n^2) in this implementation due to the partitioning logic,
31+
* or O(n log n) if a more optimized partitioning strategy is used.
32+
* Sorting takes O(n log n), where n is the number of unique symbols.
33+
*
34+
* <p>References:
35+
* <ul>
36+
* <li><a href="https://en.wikipedia.org/wiki/Shannonâ€"Fano_coding">Wikipedia: Shannonâ€"Fano coding</a></li>
37+
* </ul>
38+
*/
39+
public final class ShannonFano {
40+
41+
/**
42+
* Private constructor to prevent instantiation of this utility class.
43+
*/
44+
private ShannonFano() {
45+
}
46+
47+
/**
48+
* A private inner class to represent a symbol and its frequency.
49+
* Implements Comparable to allow sorting based on frequency.
50+
*/
51+
private static class Symbol implements Comparable<Symbol> {
52+
final char character;
53+
final int frequency;
54+
String code = "";
55+
56+
Symbol(char character, int frequency) {
57+
this.character = character;
58+
this.frequency = frequency;
59+
}
60+
61+
@Override
62+
public int compareTo(Symbol other) {
63+
return Integer.compare(other.frequency, this.frequency); // Sort descending
64+
}
65+
}
66+
67+
/**
68+
* Generates Shannon-Fano codes for the symbols in a given text.
69+
*
70+
* @param text The input string for which to generate codes. Must not be null.
71+
* @return A map where keys are characters and values are their corresponding Shannon-Fano codes.
72+
*/
73+
public static Map<Character, String> generateCodes(String text) {
74+
if (text == null || text.isEmpty()) {
75+
return Collections.emptyMap();
76+
}
77+
78+
Map<Character, Integer> frequencyMap = new HashMap<>();
79+
for (char c : text.toCharArray()) {
80+
frequencyMap.put(c, frequencyMap.getOrDefault(c, 0) + 1);
81+
}
82+
83+
List<Symbol> symbols = new ArrayList<>();
84+
for (Map.Entry<Character, Integer> entry : frequencyMap.entrySet()) {
85+
symbols.add(new Symbol(entry.getKey(), entry.getValue()));
86+
}
87+
88+
Collections.sort(symbols);
89+
90+
// Special case: only one unique symbol
91+
if (symbols.size() == 1) {
92+
symbols.getFirst().code = "0";
93+
} else {
94+
buildCodeTree(symbols, 0, symbols.size() - 1, "");
95+
}
96+
97+
return symbols.stream().collect(Collectors.toMap(s -> s.character, s -> s.code));
98+
}
99+
100+
/**
101+
* Recursively builds the Shannon-Fano code tree by partitioning the list of symbols.
102+
* Uses index-based approach to avoid sublist creation issues.
103+
*
104+
* @param symbols The sorted list of symbols to be processed.
105+
* @param start The start index of the current partition.
106+
* @param end The end index of the current partition (inclusive).
107+
* @param prefix The current prefix code being built for the symbols in this partition.
108+
*/
109+
private static void buildCodeTree(List<Symbol> symbols, int start, int end, String prefix) {
110+
// The initial check in generateCodes ensures start <= end is always true here.
111+
// The base case is when a partition has only one symbol.
112+
if (start == end) {
113+
symbols.get(start).code = prefix;
114+
return;
115+
}
116+
117+
// Find the optimal split point
118+
int splitIndex = findSplitIndex(symbols, start, end);
119+
120+
// Recursively process left and right partitions with updated prefixes
121+
buildCodeTree(symbols, start, splitIndex, prefix + "0");
122+
buildCodeTree(symbols, splitIndex + 1, end, prefix + "1");
123+
}
124+
125+
/**
126+
* Finds the index that splits the range into two parts with the most balanced frequency sums.
127+
* This method tries every possible split point and returns the index that minimizes the
128+
* absolute difference between the two partition sums.
129+
*
130+
* @param symbols The sorted list of symbols.
131+
* @param start The start index of the range.
132+
* @param end The end index of the range (inclusive).
133+
* @return The index of the last element in the first partition.
134+
*/
135+
private static int findSplitIndex(List<Symbol> symbols, int start, int end) {
136+
// Calculate total frequency for the entire range
137+
long totalFrequency = 0;
138+
for (int i = start; i <= end; i++) {
139+
totalFrequency += symbols.get(i).frequency;
140+
}
141+
142+
long leftSum = 0;
143+
long minDifference = Long.MAX_VALUE;
144+
int splitIndex = start;
145+
146+
// Try every possible split point and find the one with minimum difference
147+
for (int i = start; i < end; i++) {
148+
leftSum += symbols.get(i).frequency;
149+
long rightSum = totalFrequency - leftSum;
150+
long difference = Math.abs(leftSum - rightSum);
151+
152+
if (difference < minDifference) {
153+
minDifference = difference;
154+
splitIndex = i;
155+
}
156+
}
157+
return splitIndex;
158+
}
159+
}
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
package com.thealgorithms.compression;
2+
3+
import static org.junit.jupiter.api.Assertions.assertEquals;
4+
5+
import org.junit.jupiter.api.Test;
6+
7+
class RunLengthEncodingTest {
8+
9+
@Test
10+
void testNullInputs() {
11+
// Test that a null input to compress returns an empty string
12+
assertEquals("", RunLengthEncoding.compress(null));
13+
14+
// Test that a null input to decompress returns an empty string
15+
assertEquals("", RunLengthEncoding.decompress(null));
16+
}
17+
18+
@Test
19+
void testCompressionSimple() {
20+
// Test a typical string with multiple runs
21+
String input = "AAAABBBCCDAA";
22+
String expected = "4A3B2C1D2A";
23+
assertEquals(expected, RunLengthEncoding.compress(input));
24+
}
25+
26+
@Test
27+
void testCompressionWithNoRuns() {
28+
// Test a string with no consecutive characters
29+
String input = "ABCDE";
30+
String expected = "1A1B1C1D1E";
31+
assertEquals(expected, RunLengthEncoding.compress(input));
32+
}
33+
34+
@Test
35+
void testCompressionEdgeCases() {
36+
// Test with an empty string
37+
assertEquals("", RunLengthEncoding.compress(""));
38+
39+
// Test with a single character
40+
assertEquals("1A", RunLengthEncoding.compress("A"));
41+
42+
// Test with a long run of a single character
43+
assertEquals("10Z", RunLengthEncoding.compress("ZZZZZZZZZZ"));
44+
}
45+
46+
@Test
47+
void testDecompressionSimple() {
48+
// Test decompression of a typical RLE string
49+
String input = "4A3B2C1D2A";
50+
String expected = "AAAABBBCCDAA";
51+
assertEquals(expected, RunLengthEncoding.decompress(input));
52+
}
53+
54+
@Test
55+
void testDecompressionWithNoRuns() {
56+
// Test decompression of a string with single characters
57+
String input = "1A1B1C1D1E";
58+
String expected = "ABCDE";
59+
assertEquals(expected, RunLengthEncoding.decompress(input));
60+
}
61+
62+
@Test
63+
void testDecompressionWithMultiDigitCount() {
64+
// Test decompression where a run count is greater than 9
65+
String input = "12A1B3C";
66+
String expected = "AAAAAAAAAAAABCCC";
67+
assertEquals(expected, RunLengthEncoding.decompress(input));
68+
}
69+
70+
@Test
71+
void testDecompressionEdgeCases() {
72+
// Test with an empty string
73+
assertEquals("", RunLengthEncoding.decompress(""));
74+
75+
// Test with a single character run
76+
assertEquals("A", RunLengthEncoding.decompress("1A"));
77+
}
78+
79+
@Test
80+
void testSymmetry() {
81+
// Test that compressing and then decompressing returns the original string
82+
String original1 = "WWWWWWWWWWWWBWWWWWWWWWWWWBBBWWWWWWWWWWWWWWWWWWWWWWWWB";
83+
String compressed = RunLengthEncoding.compress(original1);
84+
String decompressed = RunLengthEncoding.decompress(compressed);
85+
assertEquals(original1, decompressed);
86+
87+
String original2 = "A";
88+
assertEquals(original2, RunLengthEncoding.decompress(RunLengthEncoding.compress(original2)));
89+
}
90+
}

0 commit comments

Comments
 (0)