
前言你有没有想过一个1GB的文件为什么压缩后能变成500MBZIP、gzip、PNG这些格式背后是什么算法数据压缩分为两类· 无损压缩解压后和原文件完全一样文本、程序· 有损压缩解压后有一些信息丢失图片、音频今天我们用C语言从零实现两种经典无损压缩算法· Huffman编码熵编码按字符频率变长编码· LZ77字典编码找重复字符串---一、Huffman编码原理1. 核心思想出现频率高的字符用短编码频率低的用长编码字符: A(5次) B(3次) C(2次) D(1次)固定编码: 00 01 10 11 (2位)Huffman编码:A: 0 (1位)B: 10 (2位)C: 110 (3位)D: 111 (3位)2. Huffman树构造(11)/ \(5) (6)/ / \A(5) (3) C(2)/ \B(2) D(1)---二、完整代码实现1. 基础数据结构c#include stdio.h#include stdlib.h#include string.h#include math.h#include stdint.h#define MAX_SYMBOLS 256#define MAX_CODE_LEN 256#define WINDOW_SIZE 4096// Huffman节点typedef struct huffman_node {unsigned char symbol;int frequency;struct huffman_node *left;struct huffman_node *right;struct huffman_node *next;} huffman_node_t;// Huffman编码typedef struct huffman_code {unsigned char symbol;char code[MAX_CODE_LEN];int len;} huffman_code_t;// 编码表typedef struct huffman_table {huffman_code_t codes[MAX_SYMBOLS];int count;} huffman_table_t;2. Huffman编码实现c// 构建频率表void build_frequency_table(const unsigned char *data, int len, int *freq) {memset(freq, 0, sizeof(int) * MAX_SYMBOLS);for (int i 0; i len; i) {freq[data[i]];}}// 创建Huffman节点huffman_node_t *create_huffman_node(unsigned char symbol, int freq) {huffman_node_t *node malloc(sizeof(huffman_node_t));node-symbol symbol;node-frequency freq;node-left NULL;node-right NULL;node-next NULL;return node;}// 插入节点到优先队列按频率升序void insert_node(huffman_node_t **head, huffman_node_t *node) {if (!*head || node-frequency (*head)-frequency) {node-next *head;*head node;return;}huffman_node_t *cur *head;while (cur-next cur-next-frequency node-frequency) {cur cur-next;}node-next cur-next;cur-next node;}// 构建Huffman树huffman_node_t *build_huffman_tree(int *freq) {huffman_node_t *head NULL;// 创建叶子节点for (int i 0; i MAX_SYMBOLS; i) {if (freq[i] 0) {huffman_node_t *node create_huffman_node(i, freq[i]);insert_node(head, node);}}if (!head) return NULL;// 合并节点while (head-next) {huffman_node_t *left head;huffman_node_t *right head-next;head right-next;huffman_node_t *parent create_huffman_node(0, left-frequency right-frequency);parent-left left;parent-right right;insert_node(head, parent);}return head;}// 生成Huffman编码void generate_codes(huffman_node_t *node, char *code, int depth, huffman_code_t *codes) {if (!node) return;if (!node-left !node-right) {strcpy(codes[node-symbol].code, code);codes[node-symbol].len depth;codes[node-symbol].symbol node-symbol;return;}if (node-left) {code[depth] 0;generate_codes(node-left, code, depth 1, codes);}if (node-right) {code[depth] 1;generate_codes(node-right, code, depth 1, codes);}}// Huffman编码压缩uint8_t *huffman_encode(const unsigned char *input, int input_len, int *output_len) {int freq[MAX_SYMBOLS] {0};build_frequency_table(input, input_len, freq);huffman_node_t *tree build_huffman_tree(freq);if (!tree) return NULL;huffman_code_t codes[MAX_SYMBOLS] {0};char code_buf[MAX_CODE_LEN];generate_codes(tree, code_buf, 0, codes);// 计算总比特数int total_bits 0;for (int i 0; i MAX_SYMBOLS; i) {if (freq[i] 0) {total_bits freq[i] * codes[i].len;}}// 分配输出压缩后数据 频率表(1024字节) 编码数据int header_size MAX_SYMBOLS * sizeof(int);int data_size (total_bits 7) / 8;*output_len header_size data_size;uint8_t *output malloc(*output_len);// 写入频率表用于解码memcpy(output, freq, header_size);// 编码数据uint8_t *bit_buffer output header_size;int bit_pos 0;for (int i 0; i input_len; i) {unsigned char c input[i];char *code codes[c].code;int len codes[c].len;for (int j 0; j len; j) {if (code[j] 1) {bit_buffer[bit_pos / 8] | (1 (bit_pos % 8));}bit_pos;}}free(tree);return output;}// Huffman解码unsigned char *huffman_decode(const uint8_t *input, int input_len, int *output_len) {int freq[MAX_SYMBOLS] {0};int header_size MAX_SYMBOLS * sizeof(int);// 读取频率表memcpy(freq, input, header_size);// 重建Huffman树huffman_node_t *tree build_huffman_tree(freq);if (!tree) return NULL;// 计算原始数据长度int total_chars 0;for (int i 0; i MAX_SYMBOLS; i) {total_chars freq[i];}unsigned char *output malloc(total_chars);int out_idx 0;// 解码const uint8_t *bit_buffer input header_size;int bit_pos 0;int total_bits (input_len - header_size) * 8;while (out_idx total_chars bit_pos total_bits) {huffman_node_t *node tree;while (node-left || node-right) {int bit (bit_buffer[bit_pos / 8] (bit_pos % 8)) 1;bit_pos;node bit ? node-right : node-left;}output[out_idx] node-symbol;}*output_len out_idx;free(tree);return output;}3. LZ77压缩c// LZ77压缩typedef struct lz77_token {uint16_t distance; // 距离uint16_t length; // 长度unsigned char next; // 下一个字符} lz77_token_t;// LZ77压缩uint8_t *lz77_encode(const unsigned char *input, int input_len, int *output_len) {int max_distance WINDOW_SIZE;int min_match 3;// 估计最大token数int max_tokens input_len / min_match 1;lz77_token_t *tokens malloc(sizeof(lz77_token_t) * max_tokens);int token_count 0;int pos 0;while (pos input_len) {int best_len 0;int best_dist 0;// 查找最长匹配int search_start pos - max_distance;if (search_start 0) search_start 0;for (int i search_start; i pos; i) {int len 0;while (len max_distance pos len input_len input[i len] input[pos len]) {len;}if (len best_len len min_match) {best_len len;best_dist pos - i;}}if (best_len min_match) {tokens[token_count].distance best_dist;tokens[token_count].length best_len;tokens[token_count].next (pos best_len input_len) ?input[pos best_len] : 0;pos best_len 1;} else {tokens[token_count].distance 0;tokens[token_count].length 0;tokens[token_count].next input[pos];pos;}token_count;}// 序列化*output_len token_count * 4 4;uint8_t *output malloc(*output_len);memcpy(output, token_count, 4);for (int i 0; i token_count; i) {output[4 i * 4] tokens[i].distance 0xFF;output[5 i * 4] (tokens[i].distance 8) 0xFF;output[6 i * 4] tokens[i].length 0xFF;output[7 i * 4] tokens[i].next;}free(tokens);return output;}4. 测试代码cvoid test_huffman() {printf( Huffman编码测试 \n\n);char *text this is a test string for huffman encoding compression test;int len strlen(text);printf(原文: %s\n, text);printf(原文大小: %d 字节\n, len);int encoded_len;uint8_t *encoded huffman_encode((unsigned char*)text, len, encoded_len);printf(压缩后大小: %d 字节\n, encoded_len);printf(压缩率: %.2f%%\n, (float)encoded_len / len * 100);int decoded_len;unsigned char *decoded huffman_decode(encoded, encoded_len, decoded_len);printf(解压后大小: %d 字节\n, decoded_len);printf(解压后: %s\n, decoded);printf(原始解压: %s\n,strcmp(text, (char*)decoded) 0 ? ✅ 成功 : ❌ 失败);free(encoded);free(decoded);}void test_lz77() {printf(\n LZ77压缩测试 \n\n);char *text abcabcabcabcabcabcabcabcabcabc;int len strlen(text);printf(原文: %s\n, text);printf(原文大小: %d 字节\n, len);int encoded_len;uint8_t *encoded lz77_encode((unsigned char*)text, len, encoded_len);printf(压缩后大小: %d 字节\n, encoded_len);printf(压缩率: %.2f%%\n, (float)encoded_len / len * 100);free(encoded);}int main() {test_huffman();test_lz77();return 0;}---三、编译和运行bashgcc -o compression compression.c -lm./compression---四、压缩算法对比算法 原理 压缩率 速度 适用场景Huffman 变长编码 中等 快 通用LZ77 字典匹配 高 中 文本LZMA LZ77Huffman 很高 慢 7zDEFLATE LZ77Huffman 高 中 gzip, zip---五、总结通过这篇文章你学会了· Huffman编码频率表、Huffman树、变长编码· LZ77算法滑动窗口、最长匹配· 两种算法的完整实现· 压缩率的计算数据压缩是存储和传输的核心技术。掌握它你就理解了ZIP、gzip、PNG的底层原理。下一篇预告《从零实现一个加密库AES与RSA》---评论区分享一下你对数据压缩的理解