概述 在计算机数据处理中,哈夫曼编码使用变长编码表 对源符号(如文件中的一个字母)进行编码,其中变长编码表是通过一种评估来源符号出现机率的方法得到的,出现机率高的字母使用较短的编码,反之出现机率低的则使用较长的编码,这便使编码之后的字符串的平均长度、期望值降低,从而达到无损压缩数据的目的。
例如,在英文中,e的出现机率最高,而z的出现概率则最低。当利用哈夫曼编码对一篇英文进行压缩时,e极有可能用一个比特来表示,而z则可能花去25个比特(不是26)。用普通的表示方法时,每个英文字母均占用一个字节,即8个比特。二者相比,e使用了一般编码的1/8的长度,z则使用了3倍多。倘若我们能实现对于英文中各个字母出现概率的较准确的估算,就可以大幅度提高无损压缩的比例。
哈夫曼树又称最优二叉树,是一种带权路径长度最短的二叉树 。所谓树的带权路径长度,就是树中所有的叶结点的权值乘上其到根结点的路径长度(若根结点为0层,叶结点到根结点的路径长度为叶结点的层数)。树的路径长度是从树根到每一结点的路径长度之和,记为WPL=(W1L1+W2 L2+W3L3+…+Wn Ln),N个权值Wi(i=1,2,…n)构成一棵有N个叶结点的二叉树,相应的叶结点的路径长度为Li(i=1,2,…n)。可以证明哈夫曼树的WPL是最小的。
基本术语 哈夫曼树又称为最优树.
1、路径和路径长度
在一棵树中,从一个结点往下可以达到的孩子或孙子结点之间的通路,称为路径。通路中分支的数目称为路径长度。若规定根结点的层数为1,则从根结点到第L层结点的路径长度为L-1。
2、结点的权及带权路径长度
若将树中结点赋给一个有着某种含义的数值,则这个数值称为该结点的权。结点的带权路径长度为:从根结点到该结点之间的路径长度与该结点的权的乘积。
3、树的带权路径长度
树的带权路径长度规定为所有叶子结点的带权路径长度之和,记为WPL。
算法思想 假设有n个权值,则构造出的哈夫曼树有n个叶子结点。 n个权值分别设为 w1、w2、…、wn,则哈夫曼树的构造规则为:
将w1、w2、…,wn看成是有n 棵树的森林(每棵树仅有一个结点);
在森林中选出两个根结点的权值最小的树合并,作为一棵新树的左、右子树,且新树的根结点权值为其左、右子树根结点权值之和;
从森林中删除选取的两棵树,并将新树加入森林;
重复2、3步,直到森林中只剩一棵树为止,该树即为所求得的哈夫曼树。
代码实现 C++实现:
#include <iostream> #include <iomanip> using namespace std ;typedef struct { int weight; int parent, lchild, rchild; } HTNode, *HuffmanTree; struct HCode { int code[256 ]; int start; } hcode[1025 ]; int n, m;void selectMin (HuffmanTree tree, int n, int &s1, int &s2) { s1 = s2 = 0 ; int i; for (i = 1 ; i < n; i++) { if (tree[i].parent == 0 ) { if (s1 == 0 ) s1 = i; else { s2 = i; break ; } } } if (tree[s1].weight > tree[s2].weight) swap(s1, s2); for (i += 1 ; i < n; i++) { if (tree[i].parent == 0 ) { if (tree[i].weight < tree[s1].weight) { s2 = s1; s1 = i; } else if (tree[i].weight < tree[s2].weight) s2 = i; } } } void createHuffmanTree (HuffmanTree &tree) { cin >> n; m = 2 * n - 1 ; tree = new HTNode[m + 1 ]; for (int i = 1 ; i <= m; i++) tree[i].parent = tree[i].lchild = tree[i].rchild = 0 ; for (int i = 1 ; i <= n; i++) cin >> tree[i].weight; tree[0 ].weight = m; for (int i = n + 1 ; i <= m; i++) { int s1, s2; selectMin(tree, i, s1, s2); tree[s1].parent = tree[s2].parent = i; tree[i].lchild = s1; tree[i].rchild = s2; tree[i].weight = tree[s1].weight + tree[s2].weight; } } void print (HuffmanTree tree) { cout << "index weight parent lchild rchild" << endl ; cout << left; for (int i = 1 , m = tree[0 ].weight; i <= m; ++i) { cout << setw(5 ) << i << " " ; cout << setw(6 ) << tree[i].weight << " " ; cout << setw(6 ) << tree[i].parent << " " ; cout << setw(6 ) << tree[i].lchild << " " ; cout << setw(6 ) << tree[i].rchild << endl ; } } int WPL_ (HuffmanTree tree, int i, int depth) { if (tree[i].lchild == 0 && tree[i].rchild == 0 ) return tree[i].weight * depth; else return WPL_(tree, tree[i].lchild, depth + 1 ) + WPL_(tree, tree[i].rchild, depth + 1 ); } int WPL (HuffmanTree tree) { return WPL_(tree, tree[0 ].weight, 0 ); } void createHCode (HuffmanTree tree, HCode code[]) { int father, child; for (int i = 1 ; i <= n; i++) { HCode hc; hc.start = n; child = i; father = tree[i].parent; while (father) { if (tree[father].lchild == child) hc.code[hc.start] = 0 ; else hc.code[hc.start] = 1 ; hc.start--; child = father; father = tree[father].parent; } hc.start++; code[i] = hc; } } void printCode (HCode hcode[]) { for (int i = 1 ; i <= n; i++) { cout << "node " << i << ": " ; for (int j = hcode[i].start; j <= n; j++) cout << hcode[i].code[j]; cout << endl ; } } int main () { HuffmanTree tree; createHuffmanTree(tree); createHCode(tree, hcode); print(tree); printCode(hcode); cout << "WPL=" << WPL(tree) << endl ; delete [] tree; return 0 ; }
优化 C++:
#include <iostream> #include <queue> #include <iomanip> using namespace std ;typedef struct Node { int index; int weight; int parent, lchild, rchild; bool operator <(const Node &x) const { return weight > x.weight; } } * HuffmanTree; void createHuffmanTree (HuffmanTree &tree) { int n, m; priority_queue <Node> q; cin >> n; m = 2 * n - 1 ; tree = new Node[m + 1 ]; for (int i = 1 ; i <= m; i++) { tree[i].index = i; tree[i].parent = tree[i].lchild = tree[i].rchild = 0 ; } for (int i = 1 ; i <= n; i++) { cin >> tree[i].weight; q.push(tree[i]); } tree[0 ].weight = m; for (int i = n + 1 ; i <= m; i++) { int s1, s2; s1 = q.top().index; q.pop(); s2 = q.top().index; q.pop(); tree[i].index = i; tree[s1].parent = tree[s2].parent = i; tree[i].lchild = s1; tree[i].rchild = s2; tree[i].weight = tree[s1].weight + tree[s2].weight; q.push(tree[i]); } } void print (HuffmanTree tree) { cout << "index weight parent lchild rchild" << endl ; cout << left; for (int i = 1 , m = tree[0 ].weight; i <= m; ++i) { cout << setw(5 ) << i << " " ; cout << setw(6 ) << tree[i].weight << " " ; cout << setw(6 ) << tree[i].parent << " " ; cout << setw(6 ) << tree[i].lchild << " " ; cout << setw(6 ) << tree[i].rchild << endl ; } } int WPL_ (HuffmanTree tree, int i, int depth) { if (tree[i].lchild == 0 && tree[i].rchild == 0 ) return tree[i].weight * depth; else return WPL_(tree, tree[i].lchild, depth + 1 ) + WPL_(tree, tree[i].rchild, depth + 1 ); } int WPL (HuffmanTree tree) { return WPL_(tree, tree[0 ].weight, 0 ); } int main () { HuffmanTree tree; createHuffmanTree(tree); print(tree); cout << "WPL=" << WPL(tree) << endl ; delete [] tree; return 0 ; }