Uso:
Código:
paq9a a "saliente.paq9a" -0...9 "entrante.archivo"
Compilación:
Código:
@path C:\MinGw\bin;%path%
g++ -fpermissive -O3 -s paq9a.cpp -o paq9a
pause
Código
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <time.h> #include <ctype.h> #define NDEBUG // remove for debugging #include <assert.h> int allocated=0; // Total memory allocated by alloc() // Create an array p of n elements of type T template <class T> void alloc(T*&p, int n) { p=(T*)calloc(n, sizeof(T)); if (!p) printf("Out of memory\n"), exit(1); allocated+=n*sizeof(T); } // 8, 16, 32 bit unsigned types (adjust as appropriate) typedef unsigned char U8; typedef unsigned short U16; typedef unsigned int U32; ///////////////////////////// Squash ////////////////////////////// // return p = 1/(1 + exp(-d)), d scaled by 8 bits, p scaled by 12 bits class Squash { short t[4096]; public: Squash(); int operator()(int p) const { if (p>2047) return 4095; if (p<-2047) return 0; return t[p+2048]; } } squash; Squash::Squash() { int ts[33]={1,2,3,6,10,16,27,45,73,120,194,310,488,747,1101, 1546,2047,2549,2994,3348,3607,3785,3901,3975,4022, 4050,4068,4079,4085,4089,4092,4093,4094}; int w,d; for (int i=-2047; i<=2047; ++i){ w=i&127; d=(i>>7)+16; t[i+2048]=(ts[d]*(128-w)+ts[(d+1)]*w+64) >> 7; } } //////////////////////////// Stretch /////////////////////////////// // Inverse of squash. stretch(d) returns ln(p/(1-p)), d scaled by 8 bits, // p by 12 bits. d has range -2047 to 2047 representing -8 to 8. // p has range 0 to 4095 representing 0 to 1. class Stretch { short t[4096]; public: Stretch(); int operator()(int p) const { assert(p>=0 && p<4096); return t[p]; } } stretch; Stretch::Stretch() { int pi=0; for (int x=-2047; x<=2047; ++x) { // invert squash() int i=squash(x); for (int j=pi; j<=i; ++j) t[j]=x; pi=i+1; } t[4095]=2047; } ///////////////////////////// ilog ////////////////////////////// // ilog(x) = round(log2(x) * 16), 0 <= x < 64K class Ilog { U8* t; public: int operator()(U16 x) const {return t[x];} Ilog(); } ilog; // Compute lookup table by numerical integration of 1/x Ilog::Ilog(): t(65536) { U32 x=14155776; for (int i=2; i<65536; ++i) { x+=774541002/(i*2-1); // numerator is 2^29/ln 2 t[i]=x>>24; } } // llog(x) accepts 32 bits inline int llog(U32 x) { if (x>=0x1000000) return 256+ilog(x>>16); else if (x>=0x10000) return 128+ilog(x>>8); else return ilog(x); } ///////////////////////// state table //////////////////////// // State table: // nex(state, 0) = next state if bit y is 0, 0 <= state < 256 // nex(state, 1) = next state if bit y is 1 // // States represent a bit history within some context. // State 0 is the starting state (no bits seen). // States 1-30 represent all possible sequences of 1-4 bits. // States 31-252 represent a pair of counts, (n0,n1), the number // of 0 and 1 bits respectively. If n0+n1 < 16 then there are // two states for each pair, depending on if a 0 or 1 was the last // bit seen. // If n0 and n1 are too large, then there is no state to represent this // pair, so another state with about the same ratio of n0/n1 is substituted. // Also, when a bit is observed and the count of the opposite bit is large, // then part of this count is discarded to favor newer data over old. static const U8 State_table[256][4]={ { 1, 2, 0, 0},{ 3, 5, 1, 0},{ 4, 6, 0, 1},{ 7, 10, 2, 0}, // 0-3 { 8, 12, 1, 1},{ 9, 13, 1, 1},{ 11, 14, 0, 2},{ 15, 19, 3, 0}, // 4-7 { 16, 23, 2, 1},{ 17, 24, 2, 1},{ 18, 25, 2, 1},{ 20, 27, 1, 2}, // 8-11 { 21, 28, 1, 2},{ 22, 29, 1, 2},{ 26, 30, 0, 3},{ 31, 33, 4, 0}, // 12-15 { 32, 35, 3, 1},{ 32, 35, 3, 1},{ 32, 35, 3, 1},{ 32, 35, 3, 1}, // 16-19 { 34, 37, 2, 2},{ 34, 37, 2, 2},{ 34, 37, 2, 2},{ 34, 37, 2, 2}, // 20-23 { 34, 37, 2, 2},{ 34, 37, 2, 2},{ 36, 39, 1, 3},{ 36, 39, 1, 3}, // 24-27 { 36, 39, 1, 3},{ 36, 39, 1, 3},{ 38, 40, 0, 4},{ 41, 43, 5, 0}, // 28-31 { 42, 45, 4, 1},{ 42, 45, 4, 1},{ 44, 47, 3, 2},{ 44, 47, 3, 2}, // 32-35 { 46, 49, 2, 3},{ 46, 49, 2, 3},{ 48, 51, 1, 4},{ 48, 51, 1, 4}, // 36-39 { 50, 52, 0, 5},{ 53, 43, 6, 0},{ 54, 57, 5, 1},{ 54, 57, 5, 1}, // 40-43 { 56, 59, 4, 2},{ 56, 59, 4, 2},{ 58, 61, 3, 3},{ 58, 61, 3, 3}, // 44-47 { 60, 63, 2, 4},{ 60, 63, 2, 4},{ 62, 65, 1, 5},{ 62, 65, 1, 5}, // 48-51 { 50, 66, 0, 6},{ 67, 55, 7, 0},{ 68, 57, 6, 1},{ 68, 57, 6, 1}, // 52-55 { 70, 73, 5, 2},{ 70, 73, 5, 2},{ 72, 75, 4, 3},{ 72, 75, 4, 3}, // 56-59 { 74, 77, 3, 4},{ 74, 77, 3, 4},{ 76, 79, 2, 5},{ 76, 79, 2, 5}, // 60-63 { 62, 81, 1, 6},{ 62, 81, 1, 6},{ 64, 82, 0, 7},{ 83, 69, 8, 0}, // 64-67 { 84, 71, 7, 1},{ 84, 71, 7, 1},{ 86, 73, 6, 2},{ 86, 73, 6, 2}, // 68-71 { 44, 59, 5, 3},{ 44, 59, 5, 3},{ 58, 61, 4, 4},{ 58, 61, 4, 4}, // 72-75 { 60, 49, 3, 5},{ 60, 49, 3, 5},{ 76, 89, 2, 6},{ 76, 89, 2, 6}, // 76-79 { 78, 91, 1, 7},{ 78, 91, 1, 7},{ 80, 92, 0, 8},{ 93, 69, 9, 0}, // 80-83 { 94, 87, 8, 1},{ 94, 87, 8, 1},{ 96, 45, 7, 2},{ 96, 45, 7, 2}, // 84-87 { 48, 99, 2, 7},{ 48, 99, 2, 7},{ 88,101, 1, 8},{ 88,101, 1, 8}, // 88-91 { 80,102, 0, 9},{103, 69,10, 0},{104, 87, 9, 1},{104, 87, 9, 1}, // 92-95 {106, 57, 8, 2},{106, 57, 8, 2},{ 62,109, 2, 8},{ 62,109, 2, 8}, // 96-99 { 88,111, 1, 9},{ 88,111, 1, 9},{ 80,112, 0,10},{113, 85,11, 0}, // 100-103 {114, 87,10, 1},{114, 87,10, 1},{116, 57, 9, 2},{116, 57, 9, 2}, // 104-107 { 62,119, 2, 9},{ 62,119, 2, 9},{ 88,121, 1,10},{ 88,121, 1,10}, // 108-111 { 90,122, 0,11},{123, 85,12, 0},{124, 97,11, 1},{124, 97,11, 1}, // 112-115 {126, 57,10, 2},{126, 57,10, 2},{ 62,129, 2,10},{ 62,129, 2,10}, // 116-119 { 98,131, 1,11},{ 98,131, 1,11},{ 90,132, 0,12},{133, 85,13, 0}, // 120-123 {134, 97,12, 1},{134, 97,12, 1},{136, 57,11, 2},{136, 57,11, 2}, // 124-127 { 62,139, 2,11},{ 62,139, 2,11},{ 98,141, 1,12},{ 98,141, 1,12}, // 128-131 { 90,142, 0,13},{143, 95,14, 0},{144, 97,13, 1},{144, 97,13, 1}, // 132-135 { 68, 57,12, 2},{ 68, 57,12, 2},{ 62, 81, 2,12},{ 62, 81, 2,12}, // 136-139 { 98,147, 1,13},{ 98,147, 1,13},{100,148, 0,14},{149, 95,15, 0}, // 140-143 {150,107,14, 1},{150,107,14, 1},{108,151, 1,14},{108,151, 1,14}, // 144-147 {100,152, 0,15},{153, 95,16, 0},{154,107,15, 1},{108,155, 1,15}, // 148-151 {100,156, 0,16},{157, 95,17, 0},{158,107,16, 1},{108,159, 1,16}, // 152-155 {100,160, 0,17},{161,105,18, 0},{162,107,17, 1},{108,163, 1,17}, // 156-159 {110,164, 0,18},{165,105,19, 0},{166,117,18, 1},{118,167, 1,18}, // 160-163 {110,168, 0,19},{169,105,20, 0},{170,117,19, 1},{118,171, 1,19}, // 164-167 {110,172, 0,20},{173,105,21, 0},{174,117,20, 1},{118,175, 1,20}, // 168-171 {110,176, 0,21},{177,105,22, 0},{178,117,21, 1},{118,179, 1,21}, // 172-175 {110,180, 0,22},{181,115,23, 0},{182,117,22, 1},{118,183, 1,22}, // 176-179 {120,184, 0,23},{185,115,24, 0},{186,127,23, 1},{128,187, 1,23}, // 180-183 {120,188, 0,24},{189,115,25, 0},{190,127,24, 1},{128,191, 1,24}, // 184-187 {120,192, 0,25},{193,115,26, 0},{194,127,25, 1},{128,195, 1,25}, // 188-191 {120,196, 0,26},{197,115,27, 0},{198,127,26, 1},{128,199, 1,26}, // 192-195 {120,200, 0,27},{201,115,28, 0},{202,127,27, 1},{128,203, 1,27}, // 196-199 {120,204, 0,28},{205,115,29, 0},{206,127,28, 1},{128,207, 1,28}, // 200-203 {120,208, 0,29},{209,125,30, 0},{210,127,29, 1},{128,211, 1,29}, // 204-207 {130,212, 0,30},{213,125,31, 0},{214,137,30, 1},{138,215, 1,30}, // 208-211 {130,216, 0,31},{217,125,32, 0},{218,137,31, 1},{138,219, 1,31}, // 212-215 {130,220, 0,32},{221,125,33, 0},{222,137,32, 1},{138,223, 1,32}, // 216-219 {130,224, 0,33},{225,125,34, 0},{226,137,33, 1},{138,227, 1,33}, // 220-223 {130,228, 0,34},{229,125,35, 0},{230,137,34, 1},{138,231, 1,34}, // 224-227 {130,232, 0,35},{233,125,36, 0},{234,137,35, 1},{138,235, 1,35}, // 228-231 {130,236, 0,36},{237,125,37, 0},{238,137,36, 1},{138,239, 1,36}, // 232-235 {130,240, 0,37},{241,125,38, 0},{242,137,37, 1},{138,243, 1,37}, // 236-239 {130,244, 0,38},{245,135,39, 0},{246,137,38, 1},{138,247, 1,38}, // 240-243 {140,248, 0,39},{249,135,40, 0},{250, 69,39, 1},{ 80,251, 1,39}, // 244-247 {140,252, 0,40},{249,135,41, 0},{250, 69,40, 1},{ 80,251, 1,40}, // 248-251 {140,252, 0,41}}; // 252, 253-255 are reserved #define nex(state,sel) State_table[state][sel] //////////////////////////// StateMap ////////////////////////// // A StateMap maps a context to a probability. Methods: // // Statemap sm(n) creates a StateMap with n contexts using 4*n bytes memory. // sm.p(cx, limit) converts state cx (0..n-1) to a probability (0..4095) // that the next updated bit y=1. // limit (1..1023, default 255) is the maximum count for computing a // prediction. Larger values are better for stationary sources. // sm.update(y) updates the model with actual bit y (0..1). class StateMap { protected: const int N; // Number of contexts int cxt; // Context of last prediction U32 *t; // cxt -> prediction in high 22 bits, count in low 10 bits static int dt[1024]; // i -> 16K/(i+3) public: StateMap(int n=256); // update bit y (0..1) void update(int y, int limit=255) { assert(cxt>=0 && cxt<N); int n=t[cxt]&1023, p=t[cxt]>>10; // count, prediction if (n<limit) ++t[cxt]; else t[cxt]=t[cxt]&0xfffffc00|limit; t[cxt]+=(((y<<22)-p)>>3)*dt[n]&0xfffffc00; } // predict next bit in context cx int p(int cx) { assert(cx>=0 && cx<N); return t[cxt=cx]>>20; } }; int StateMap::dt[1024]={0}; StateMap::StateMap(int n): N(n), cxt(0) { alloc(t, N); for (int i=0; i<N; ++i) t[i]=1<<31; if (dt[0]==0) for (int i=0; i<1024; ++i) dt[i]=16384/(i+i+3); } //////////////////////////// Mix, APM ///////////////////////// // Mix combines 2 predictions and a context to produce a new prediction. // Methods: // Mix m(n) -- creates allowing with n contexts. // m.pp(p1, p2, cx) -- inputs 2 stretched predictions and a context cx // (0..n-1) and returns a stretched prediction. Stretched predictions // are fixed point numbers with an 8 bit fraction, normally -2047..2047 // representing -8..8, such that 1/(1+exp(-p) is the probability that // the next update will be 1. // m.update(y) updates the model after a prediction with bit y (0..1). class Mix { protected: const int N; // n int* wt; // weights, scaled 24 bits int x1, x2; // inputs, scaled 8 bits (-2047 to 2047) int cxt; // last context (0..n-1) int pr; // last output public: Mix(int n=512); int pp(int p1, int p2, int cx) { assert(cx>=0 && cx<N); cxt=cx*2; return pr=(x1=p1)*(wt[cxt]>>16)+(x2=p2)*(wt[cxt+1]>>16)+128>>8; } void update(int y) { assert(y==0 || y==1); int err=((y<<12)-squash(pr)); if ((wt[cxt]&3)<3) err*=4-(++wt[cxt]&3); err=err+8>>4; wt[cxt]+=x1*err&-4; wt[cxt+1]+=x2*err; } }; Mix::Mix(int n): N(n), x1(0), x2(0), cxt(0), pr(0) { alloc(wt, n*2); for (int i=0; i<N*2; ++i) wt[i]=1<<23; } // An APM is a Mix optimized for a constant in place of p1, used to // refine a stretched prediction given a context cx. // Normally p1 is in the range (0..4095) and p2 is doubled. class APM: public Mix { public: APM(int n); }; APM::APM(int n): Mix(n) { for (int i=0; i<n; ++i) wt[2*i]=0; } //////////////////////////// HashTable ///////////////////////// // A HashTable maps a 32-bit index to an array of B bytes. // The first byte is a checksum using the upper 8 bits of the // index. The second byte is a priority (0 = empty) for hash // replacement. The index need not be a hash. // HashTable<B> h(n) - create using n bytes n and B must be // powers of 2 with n >= B*4, and B >= 2. // h[i] returns array [1..B-1] of bytes indexed by i, creating and // replacing another element if needed. Element 0 is the // checksum and should not be modified. template <int B> class HashTable { U8* t; // table: 1 element = B bytes: checksum priority data data const U32 N; // size in bytes public: HashTable(int n); ~HashTable(); U8* operator[](U32 i); }; template <int B> HashTable<B>::HashTable(int n): t(0), N(n) { assert(B>=2 && (B&B-1)==0); assert(N>=B*4 && (N&N-1)==0); alloc(t, N+B*4+64); t+=64-int(((long)t)&63); // align on cache line boundary } template <int B> inline U8* HashTable<B>::operator[](U32 i) { i*=123456791; i=i<<16|i>>16; i*=234567891; int chk=i>>24; i=i*B&N-B; if (t[i]==chk) return t+i; if (t[i^B]==chk) return t+(i^B); if (t[i^B*2]==chk) return t+(i^B*2); if (t[i+1]>t[i+1^B] || t[i+1]>t[i+1^B*2]) i^=B; if (t[i+1]>t[i+1^B^B*2]) i^=B^B*2; memset(t+i, 0, B); t[i]=chk; return t+i; } template <int B> HashTable<B>::~HashTable() { int c=0, c0=0; for (U32 i=0; i<N; ++i) { if (t[i]) { ++c; if (i%B==0) ++c0; } } printf("HashTable<%d> %1.4f%% full, %1.4f%% utilized of %d KiB\n", B, 100.0*c0*B/N, 100.0*c/N, N>>10); } ////////////////////////// LZP ///////////////////////// U32 MEM=1<<29; // Global memory limit, 1 << 22+(memory option) // LZP predicts the next byte and maintains context. Methods: // c() returns the predicted byte for the next update, or -1 if none. // p() returns the 12 bit probability (0..4095) that c() is next. // update(ch) updates the model with actual byte ch (0..255). // c(i) returns the i'th prior byte of context, i > 0. // c4() returns the order 4 context, shifted into the LSB. // c8() returns a hash of the order 8 context, shifted 4 bits into LSB. // word0, word1 are hashes of the current and previous word (a-z). class LZP { private: const int N, H; // buf, t sizes enum {MINLEN=12}; // minimum match length U8* buf; // Rotating buffer of size N U32* t; // hash table of pointers in high 24 bits, state in low 8 bits int match; // start of match int len; // length of match int pos; // position of next ch to write to buf U32 h; // context hash U32 h1; // hash of last 8 byte updates, shifting 4 bits to MSB U32 h2; // last 4 updates, shifting 8 bits to MSB StateMap sm1; // len+offset -> p APM a1, a2, a3; // p, context -> p int literals, matches; // statistics public: U32 word0, word1; // hashes of last 2 words (case insensitive a-z) LZP(); ~LZP(); int c(); // predicted char int c(int i);// context int c4() {return h2;} // order 4 context, c(1) in LSB int c8() {return h1;} // hashed order 8 context int p(); // probability that next char is c() * 4096 void update(int ch); // update model with actual char ch }; // Initialize LZP::LZP(): N(MEM/8), H(MEM/32), match(-1), len(0), pos(0), h(0), h1(0), h2(0), sm1(0x200), a1(0x10000), a2(0x40000), a3(0x100000), literals(0), matches(0), word0(0), word1(0) { assert(MEM>0); assert(H>0); alloc(buf, N); alloc(t, H); } // Print statistics LZP::~LZP() { int c=0; for (int i=0; i<H; ++i) c+=(t[i]!=0); printf("LZP hash table %1.4f%% full of %d KiB\n" "LZP buffer %1.4f%% full of %d KiB\n", 100.0*c/H, H>>8, pos<N?100.0*pos/N:100.0, N>>10); printf("LZP %d literals, %d matches (%1.4f%% matched)\n", literals, matches, literals+matches>0?100.0*matches/(literals+matches):0.0); } // Predicted next byte, or -1 for no prediction inline int LZP::c() { return len>=MINLEN ? buf[match&N-1] : -1; } // Return i'th byte of context (i > 0) inline int LZP::c(int i) { assert(i>0); return buf[pos-i&N-1]; } // Return prediction that c() will be the next byte (0..4095) int LZP::p() { if (len<MINLEN) return 0; int cxt=len; if (len>28) cxt=28+(len>=32)+(len>=64)+(len>=128); int pc=c(); int pr=sm1.p(cxt); pr=stretch(pr); pr=a1.pp(2048, pr*2, h2*256+pc&0xffff)*3+pr>>2; pr=a2.pp(2048, pr*2, h1*(11<<6)+pc&0x3ffff)*3+pr>>2; pr=a3.pp(2048, pr*2, h1*(7<<4)+pc&0xfffff)*3+pr>>2; pr=squash(pr); return pr; } // Update model with predicted byte ch (0..255) void LZP::update(int ch) { int y=c()==ch; // 1 if prediction of ch was right, else 0 h1=h1*(3<<4)+ch+1; // update context hashes h2=h2<<8|ch; h=h*(5<<2)+ch+1&H-1; if (len>=MINLEN) { sm1.update(y); a1.update(y); a2.update(y); a3.update(y); } if (isalpha(ch)) word0=word0*(29<<2)+tolower(ch); else if (word0) word1=word0, word0=0; buf[pos&N-1]=ch; // update buf ++pos; if (y) { // extend match ++len; ++match; ++matches; } else { // find new match, try order 6 context first ++literals; y=0; len=1; match=t[h]; if (!((match^pos)&N-1)) --match; while (len<=128 && buf[match-len&N-1]==buf[pos-len&N-1]) ++len; --len; } t[h]=pos; } LZP* lzp=0; //////////////////////////// Predictor ///////////////////////// // A Predictor estimates the probability that the next bit of // uncompressed data is 1. Methods: // Predictor() creates. // p() returns P(1) as a 12 bit number (0-4095). // update(y) trains the predictor with the actual bit (0 or 1). class Predictor { enum {N=11}; // number of contexts int c0; // last 0-7 bits with leading 1, 0 before LZP flag int nibble; // last 0-3 bits with leading 1 (1..15) int bcount; // number of bits in c0 (0..7) HashTable<16> t; // context -> state StateMap sm[N]; // state -> prediction U8* cp[N]; // i -> state array of bit histories for i'th context U8* sp[N]; // i -> pointer to bit history for i'th context Mix m[N-1]; // combines 2 predictions given a context APM a1, a2, a3; // adjusts a prediction given a context U8* t2; // order 1 contexts -> state public: Predictor(); int p(); void update(int y); }; // Initialize Predictor::Predictor(): c0(0), nibble(1), bcount(0), t(MEM/2), a1(0x10000), a2(0x10000), a3(0x10000) { alloc(t2, 0x40000); for (int i=0; i<N; ++i) sp[i]=cp[i]=t2; } // Update model void Predictor::update(int y) { assert(y==0 || y==1); assert(bcount>=0 && bcount<8); assert(c0>=0 && c0<256); assert(nibble>=1 && nibble<=15); if (c0==0) c0=1-y; else { *sp[0]=nex(*sp[0], y); sm[0].update(y); for (int i=1; i<N; ++i) { *sp[i]=nex(*sp[i], y); sm[i].update(y); m[i-1].update(y); } c0+=c0+y; if (++bcount==8) bcount=c0=0; if ((nibble+=nibble+y)>=16) nibble=1; a1.update(y); a2.update(y); a3.update(y); } } // Predict next bit int Predictor::p() { assert(lzp); if (c0==0) return lzp->p(); else { // Set context pointers int pc=lzp->c(); // mispredicted byte int r=pc+256>>8-bcount==c0; // c0 consistent with mispredicted byte? U32 c4=lzp->c4(); // last 4 whole context bytes, shifted into LSB U32 c8=(lzp->c8()<<4)-1; // hash of last 7 bytes with 4 trailing 1 bits if ((bcount&3)==0) { // nibble boundary? Update context pointers pc&=-r; U32 c4p=c4<<8; if (bcount==0) { // byte boundary? Update order-1 context pointers cp[0]=t2+(c4>>16&0xff00); cp[1]=t2+(c4>>8 &0xff00)+0x10000; cp[2]=t2+(c4 &0xff00)+0x20000; cp[3]=t2+(c4<<8 &0xff00)+0x30000; } cp[4]=t[(c4p&0xffff00)-c0]; cp[5]=t[(c4p&0xffffff00)*3+c0]; cp[6]=t[c4*7+c0]; cp[7]=t[(c8*5&0xfffffc)+c0]; cp[8]=t[(c8*11&0xffffff0)+c0+pc*13]; cp[9]=t[lzp->word0*5+c0+pc*17]; cp[10]=t[lzp->word1*7+lzp->word0*11+c0+pc*37]; } // Mix predictions r<<=8; sp[0]=&cp[0][c0]; int pr=stretch(sm[0].p(*sp[0])); for (int i=1; i<N; ++i) { sp[i]=&cp[i][i<4?c0:nibble]; int st=*sp[i]; pr=m[i-1].pp(pr, stretch(sm[i].p(st)), st+r)*3+pr>>2; } pr=a1.pp(512, pr*2, c0+pc*256&0xffff)*3+pr>>2; // Adjust prediction pr=a2.pp(512, pr*2, c4<<8&0xff00|c0)*3+pr>>2; pr=a3.pp(512, pr*2, c4*3+c0&0xffff)*3+pr>>2; return squash(pr); } } Predictor* predictor=0; /////////////////////////// get4, put4 ////////////////////////// // Read/write a 4 byte big-endian number int get4(FILE* in) { int r=getc(in); r=r*256+getc(in); r=r*256+getc(in); r=r*256+getc(in); return r; } void put4(U32 c, FILE* out) { fprintf(out, "%c%c%c%c", c>>24, c>>16, c>>8, c); } //////////////////////////// Encoder //////////////////////////// // An Encoder arithmetic codes in blocks of size BUFSIZE. Methods: // Encoder(COMPRESS, f) creates encoder for compression to archive f, which // must be open past any header for writing in binary mode. // Encoder(DECOMPRESS, f) creates encoder for decompression from archive f, // which must be open past any header for reading in binary mode. // code(i) in COMPRESS mode compresses bit i (0 or 1) to file f. // code() in DECOMPRESS mode returns the next decompressed bit from file f. // count() should be called after each byte is compressed. // flush() should be called after compression is done. It is also called // automatically when a block is written. typedef enum {COMPRESS, DECOMPRESS} Mode; class Encoder { private: const Mode mode; // Compress or decompress? FILE* archive; // Compressed data file U32 x1, x2; // Range, initially [0, 1), scaled by 2^32 U32 x; // Decompress mode: last 4 input bytes of archive enum {BUFSIZE=0x20000}; static unsigned char* buf; // Compression output buffer, size BUFSIZE int usize, csize; // Buffered uncompressed and compressed sizes double usum, csum; // Total of usize, csize public: Encoder(Mode m, FILE* f); void flush(); // call this when compression is finished // Compress bit y or return decompressed bit int code(int y=0) { assert(predictor); int p=predictor->p(); assert(p>=0 && p<4096); p+=p<2048; U32 xmid=x1 + (x2-x1>>12)*p + ((x2-x1&0xfff)*p>>12); assert(xmid>=x1 && xmid<x2); if (mode==DECOMPRESS) y=x<=xmid; y ? (x2=xmid) : (x1=xmid+1); predictor->update(y); while (((x1^x2)&0xff000000)==0) { // pass equal leading bytes of range if (mode==COMPRESS) buf[csize++]=x2>>24; x1<<=8; x2=(x2<<8)+255; if (mode==DECOMPRESS) x=(x<<8)+getc(archive); } return y; } // Count one byte void count() { assert(mode==COMPRESS); ++usize; if (csize>BUFSIZE-256) flush(); } }; unsigned char* Encoder::buf=0; // Create in mode m (COMPRESS or DECOMPRESS) with f opened as the archive. Encoder::Encoder(Mode m, FILE* f): mode(m), archive(f), x1(0), x2(0xffffffff), x(0), usize(0), csize(0), usum(0), csum(0) { if (mode==DECOMPRESS) { // x = first 4 bytes of archive for (int i=0; i<4; ++i) x=(x<<8)+(getc(archive)&255); csize=4; } else if (!buf) alloc(buf, BUFSIZE); } // Write a compressed block and reinitialize the encoder. The format is: // uncompressed size (usize, 4 byte, MSB first) // compressed size (csize, 4 bytes, MSB first) // compressed data (csize bytes) void Encoder::flush() { if (mode==COMPRESS) { buf[csize++]=x1>>24; buf[csize++]=255; buf[csize++]=255; buf[csize++]=255; putc(0, archive); putc('c', archive); put4(usize, archive); put4(csize, archive); fwrite(buf, 1, csize, archive); usum+=usize; csum+=csize+10; printf("%15.0f -> %15.0f" "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", usum, csum); x1=x=usize=csize=0; x2=0xffffffff; } } /////////////////////////// paq9a //////////////////////////////// // Compress or decompress from in to out, depending on whether mode // is COMPRESS or DECOMPRESS. A byte c is encoded as a 1 bit if it // is predicted by LZP, otherwise a 0 followed by 8 bits from MSB to LSB. void paq9a(FILE* in, FILE* out, Mode mode) { if (!lzp && !predictor) { lzp=new LZP; predictor=new Predictor; printf("%8d KiB\b\b\b\b\b\b\b\b\b\b\b\b", allocated>>10); } if (mode==COMPRESS) { Encoder e(COMPRESS, out); int c; while ((c=getc(in))!=EOF) { int cp=lzp->c(); if (c==cp) e.code(1); else for (int i=8; i>=0; --i) e.code(c>>i&1); e.count(); lzp->update(c); } e.flush(); } else { // DECOMPRESS int usize=get4(in); get4(in); // csize Encoder e(DECOMPRESS, in); while (usize--) { int c=lzp->c(); if (e.code()==0) { c=1; while (c<256) c+=c+e.code(); c&=255; } if (out) putc(c, out); lzp->update(c); } } } ///////////////////////////// store /////////////////////////// // Store a file in blocks as: {'\0' mode usize csize contents}... void store(FILE* in, FILE* out) { assert(in); assert(out); // Store in blocks const int BLOCKSIZE=0x100000; static char* buf=0; if (!buf) alloc(buf, BLOCKSIZE); bool first=true; while (true) { int n=fread(buf, 1, BLOCKSIZE, in); if (!first && n<=0) break; fprintf(out, "%c%c", 0, 's'); put4(n, out); // usize put4(n, out); // csize fwrite(buf, 1, n, out); first=false; } // Close file fclose(in); } // Write usize == csize bytes of an uncompressed block from in to out void unstore(FILE* in, FILE* out) { assert(in); int usize=get4(in); int csize=get4(in); if (usize!=csize) printf("Bad archive format: usize=%d csize=%d\n", usize, csize); static char* buf=0; const int BUFSIZE=0x1000; if (!buf) alloc(buf, BUFSIZE); while (csize>0) { usize=csize; if (usize>BUFSIZE) usize=BUFSIZE; if (int(fread(buf, 1, usize, in))!=usize) printf("Unexpected end of archive\n"), exit(1); if (out) fwrite(buf, 1, usize, out); csize-=usize; } } //////////////////////// Archiving functions //////////////////////// const int MAXNAMELEN=1023; // max filename length // Return true if the first 4 bytes of in are a valid archive bool check_archive(FILE* in) { return getc(in)=='p' && getc(in)=='Q' && getc(in)=='9' && getc(in)==1; } // Open archive and check for valid archive header, exit if bad. // Set MEM to memory option '1' through '9' FILE* open_archive(const char* filename) { FILE* in=fopen(filename, "rb"); if (!in) printf("Cannot find archive %s\n", filename), exit(1); if (!check_archive(in) || (MEM=getc(in))<'1' || MEM>'9') { fclose(in); printf("%s: Not a paq9a archive\n", filename); exit(1); } return in; } // Compress filename to out. option is 'c' to compress or 's' to store. void compress(const char* filename, FILE* out, int option) { // Open input file FILE* in=fopen(filename, "rb"); if (!in) { printf("File not found: %s\n", filename); return; } fprintf(out, "%s", filename); printf("%-40s ", filename); // Compress depending on option if (option=='s') store(in, out); else if (option=='c') paq9a(in, out, COMPRESS); printf("\n"); } // List archive contents void list(const char* archive) { double usum=0, csum=0; // uncompressed and compressed size per file double utotal=0, ctotal=4; // total size in archive static char filename[MAXNAMELEN+1]; int mode=0; FILE* in=open_archive(archive); printf("\npaq9a -%c\n", MEM); while (true) { // Get filename, mode int c=getc(in); if (c==EOF) break; if (c) { // start of new file? Print previous file if (mode) printf("%10.0f -> %10.0f %c %s\n", usum, csum, mode, filename); int len=0; filename[len++]=c; while ((c=getc(in))!=EOF && c) if (len<MAXNAMELEN) filename[len++]=c; filename[len]=0; utotal+=usum; ctotal+=csum; usum=0; csum=len; } // Get uncompressed size mode=getc(in); int usize=get4(in); usum+=usize; // Get compressed size int csize=get4(in); csum+=csize+10; if (usize<0 || csize<0 || mode!='c' && mode!='s') printf("Archive corrupted usize=%d csize=%d mode=%d at %ld\n", usize, csize, mode, ftell(in)), exit(1); // Skip csize bytes const int BUFSIZE=0x1000; char buf[BUFSIZE]; while (csize>BUFSIZE) csize-=fread(buf, 1, BUFSIZE, in); fread(buf, 1, csize, in); } printf("%10.0f -> %10.0f %c %s\n", usum, csum, mode, filename); utotal+=usum; ctotal+=csum; printf("%10.0f -> %10.0f total\n", utotal, ctotal); fclose(in); } // Extract files given command line arguments // Input format is: [filename {'\0' mode usize csize contents}...]... void extract(int argc, char** argv) { assert(argc>2); assert(argv[1][0]=='x'); static char filename[MAXNAMELEN+1]; // filename from archive // Open archive FILE* in=open_archive(argv[2]); MEM=1<<22+MEM-'0'; // Extract files argc-=3; argv+=3; FILE* out=0; while (true) { // for each block // Get filename int c; for (int i=0;; ++i) { c=getc(in); if (c==EOF) break; if (i<MAXNAMELEN) filename[i]=c; if (!c) break; } if (c==EOF) break; // Open output file if (filename[0]) { // new file? const char* fn=filename; if (argc>0) fn=argv[0], --argc, ++argv; if (out) fclose(out); out=fopen(fn, "rb"); if (out) { printf("\nCannot overwrite file, skipping: %s ", fn); fclose(out); out=0; } else { out=fopen(fn, "wb"); if (!out) printf("\nCannot create file: %s ", fn); } if (out) { if (fn==filename) printf("\n%s ", filename); else printf("\n%s -> %s ", filename, fn); } } // Extract block int mode=getc(in); if (mode=='s') unstore(in, out); else if (mode=='c') paq9a(in, out, DECOMPRESS); else printf("\nUnsupported compression mode %c %d at %ld\n", mode, mode, ftell(in)), exit(1); } printf("\n"); if (out) fclose(out); } // Command line is: paq9a {a|x|l} archive [[-option] files...]... int main(int argc, char** argv) { clock_t start=clock(); // Check command line arguments if (argc<3 || argv[1][1] || (argv[1][0]!='a' && argv[1][0]!='x' && argv[1][0]!='l') || (argv[1][0]=='a' && argc<4) || argv[2][0]=='-') { printf("paq9a archiver (C) 2007, Matt Mahoney\n" "Free software under GPL, http://www.gnu.org/copyleft/gpl.html\n" "\n" "To create archive: paq9a a archive [-1..-9] [[-s|-c] files...]...\n" " -1..-9 = use 18 to 1585 MiB memory (default -7 = 408 MiB)\n" " -s = store, -c = compress (default)\n" "To extract files: paq9a x archive [files...]\n" "To list contents: paq9a l archive\n"); exit(1); } // Create archive if (argv[1][0]=='a') { int option = 'c'; // -c or -s FILE* out=fopen(argv[2], "rb"); if (out) printf("Cannot overwrite archive %s\n", argv[2]), exit(1); out=fopen(argv[2], "wb"); if (!out) printf("Cannot create archive %s\n", argv[2]), exit(1); fprintf(out, "pQ9%c", 1); int i=3; if (argc>3 && argv[3][0]=='-' && argv[3][1]>='1' && argv[3][1]<='9' && argv[3][2]==0) { putc(argv[3][1], out); MEM=1<<22+argv[3][1]-'0'; ++i; } else putc('7', out); for (; i<argc; ++i) { if (argv[i][0]=='-' && (argv[i][1]=='c' || argv[i][1]=='s') && argv[i][2]==0) option=argv[i][1]; else compress(argv[i], out, option); } printf("-> %ld in %1.2f sec\n", ftell(out), double(clock()-start)/CLOCKS_PER_SEC); } // List archive contents else if (argv[1][0]=='l') list(argv[2]); // Extract from archive else if (argv[1][0]=='x') { extract(argc, argv); printf("%1.2f sec\n", double(clock()-start)/CLOCKS_PER_SEC); } // Report statistics delete predictor; delete lzp; printf("Used %d KiB memory\n", allocated>>10); return 0; }