// Profi-maX implementation // http://www.csse.monash.edu.au/cluster/RJK/Compress/problem.html // http://www.csse.monash.edu.au/cluster/RJK/Compress/bpe.c //* bpe.c - rewritten to handle parameterised command line input */ //* from compress.c */ //* Copyright Philip Gage */ //* printed in 'The C Users Journal' February, 1994 */ //https://github.com/vteromero/byte-pair-encoding unit MainCompressUnit; interface uses Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms, Dialogs, StdCtrls, Grids, ValEdit; type TForm1 = class(TForm) Button1: TButton; OpenDialog1: TOpenDialog; Values1: TValueListEditor; Values2: TValueListEditor; Button2: TButton; procedure Button2Click(Sender: TObject); procedure Button1Click(Sender: TObject); private { Private declarations } public { Public declarations } end; var Form1: TForm1; implementation {$R *.dfm} const BLOCKSIZE = 10000; //* maximum block size */ HASHSIZE = 8192; //* size of hash table */ MAXCHARS = 220; //* char set per block */ THRESHOLD = 3; //* minimum pair count */ var buffer : array of BYTE;// array [0..BLOCKSIZE - 1] of BYTE; //* data block */ leftcode: array [0..255] of BYTE; //* pair table */ rightcode: array [0..255] of BYTE; //* pair table */ leftArray: array of BYTE;// array [0..HASHSIZE - 1] of BYTE; //* hash table */ rightArray: array of BYTE;// array [0..HASHSIZE - 1] of BYTE; //* hash table */ count: array of BYTE;// array [0..HASHSIZE - 1] of BYTE; //* pair count */ size: Integer; //* size of current data block */ //==================================================================================================================== function SED(Value: DWORD): DWORD {integer}; register; asm bswap eax end; //============================================================================= function SEW(Value: WORD {smallint}): WORD {smallint}; register; asm xchg al, ah end; //============================================================================= //* return index of character pair in hash table */ //* deleted nodes have a count of 1 for hashing */ //int lookup (unsigned char a, unsigned char b, int hs) function LN_lookup(a, b: BYTE; hs: Integer): Integer; begin //* compute hash key from both characters */ Result := ( a xor (b shl 5)) and (hs - 1); //* search for pair or first empty slot */ while ((leftArray[Result] <> a) or (rightArray[Result] <> b)) and (count[Result] <> 0) do Result := (Result + 1) and (hs - 1); leftArray[Result] := a; rightArray[Result] := b; end; //==================================================================================================================== //* read next block from InStream file into buffer */ function LN_fileread (InStream: TStream; bs, hs, mc: Integer): boolean; var c, index, used: Integer; bBuf: BYTE; begin used := 0; //* reset hash table and pair table */ for c := 0 to hs - 1 do count[c] := 0; for c := 0 to 255 do begin leftcode[c] := c; rightcode[c] := 0; end; size := 0; //* read data until full or few unused chars */ //while (size < bs && used < mc && (c = getc(InStream)) != EOF) while (size < bs) and (used < mc) and (InStream.Position < InStream.Size) do begin InStream.Read(bBuf, 1); c := bBuf; if size > 0 then begin index := LN_lookup(buffer[size-1], c, hs); if (count[index] < 255) then begin Inc(count[index]); end; end; buffer[size] := c; Inc(size); //* use right code to flag data chars found */ if rightcode[c] = 0 then begin rightcode[c] := 1; Inc(used); end; end; if used >= mc then Form1.Caption := 'used >= mc'; Result := InStream.Position = InStream.Size; end; //==================================================================================================================== //* write each pair table and data block to OutStream */ function LN_filewrite(OutStream: TStream): DWORD; var i, len, c: Integer; begin c := 0; //* for each character 0..255 */ Result := OutStream.Position; while c < 256 do begin //* if not a pair code, count run of literals */ if c = leftcode[c] then begin len := 1; Inc(c); while (len < 127) and (c < 256) and (c = leftcode[c]) do begin Inc(len); Inc(c); end; len := len + 127; OutStream.Write(len, 1); len := 0; if c = 256 then BREAK; end //* else count run of pair codes */ else begin len := 0; Inc(c); while ((len < 127) and (c < 256) and (c <> leftcode[c])) or ((len < 125) and (c < 254) and ((c+1) <> leftcode[c+1])) do begin Inc(len); Inc(c); end; OutStream.Write(len, 1); c := c - (len + 1); end; //* write range of pairs to OutStream */ i := 0; while i <= len do begin OutStream.Write(leftcode[c], 1); if c <> leftcode[c] then OutStream.Write(rightcode[c], 1); Inc(c); Inc(i); end; end; // Form1.Values1.Strings.Add('Size=' + IntToHex(size, 4)); // Form1.Values1.Strings.Add(''); len := size div 256; OutStream.Write(len, 1); len := size - (len * 256); OutStream.Write(len, 1); OutStream.Write(buffer[0], size); Result := OutStream.Position - Result; end; //==================================================================================================================== //* compress from InStream file to OutStream file */ function compressBlock(infile, outfile : TStream; bs, hs, mc, th: Integer): DWORD; var leftch, rightch, code, oldsize: Integer; index, r, w, best: Integer; done: boolean; KeysCount: Integer; begin done := FALSE; Result := 0; outfile.Position := outfile.Size; //* compress each data block until end of file */ while not done do begin done := LN_fileread(infile, bs, hs, mc); code := 256; KeysCount := 0; //* compress this block */ while True do begin //* get next unused chr for pair code */ Dec(code); while code >= 0 do begin if ( code = leftcode[code]) and (rightcode[code] = 0) then BREAK; Dec(code); end; //* must quit if no unused chars left */ if ( code < 127 ) then BREAK; //* find most frequent pair of chars */ best := 2; for index := 0 to hs - 1 do if (count[index] > best) then begin best := count[index]; leftch := leftArray[index]; rightch := rightArray[index]; end; //* done if no more compression possible */ if ( best < th ) then BREAK; //* Replace pairs in data, adjust pair counts */ oldsize := size - 1; w := 0; r := 0; while r < oldsize do begin if (buffer[r] = leftch) and ( buffer[r+1] = rightch) then begin if ( r > 0 ) then begin index := LN_lookup(buffer[w-1], leftch, hs); if ( count[index] > 1 ) then Dec(count[index]); index := LN_lookup( buffer[w-1], code, hs ); if ( count[index] < 255 ) then Inc(count[index]); end; if ( r < oldsize - 1 ) then begin index := LN_lookup( rightch, buffer[r+2] , hs); if ( count[index] > 1 ) then Dec(count[index]); index := LN_lookup( code, buffer[r+2], hs ); if ( count[index] < 255 ) then Inc(count[index]); end; buffer[w] := code; Inc(w); Inc(r); Dec(size); end else begin buffer[w] := buffer[r]; Inc(w); end; Inc(r); end; buffer[w] := buffer[r]; //* add to pair substitution table */ leftcode[code] := leftch; rightcode[code] := rightch; //* delete pair from hash table */ index := LN_lookup( leftch, rightch, hs ); count[index] := 1; // Form1.Values1.Strings.Add(IntToHex(code, 2) + '=' + IntToHex(leftch, 2) + IntToHex(rightch, 2)); Inc(KeysCount); end; Result := Result + LN_filewrite( outfile ); // Form1.Values1.Strings.Add('KeysCount=' + IntToStr(KeysCount)); end; oldsize := outfile.Size; if (outfile.Size and $7FF) > 0 then oldsize := (oldsize and $FFFFF800) + $800; outfile.Size := oldsize; end; //==================================================================================================================== type TLN_VocRec = record Key1, Key2: BYTE; IsKey: boolean; end; function ReadRLE_Vocabulare(aInFile: TFileStream): WORD; var aByte: BYTE; i: Integer; Index, StepCount: WORD; fVoc: array [0..255] of TLN_VocRec; KeysCount: Integer; //-------------------------------------- procedure ReadKey; var aVal: BYTE; begin if Index > Length(fVoc) then Exit; aInFile.Read(aVal, 1); if aVal <> Index then begin fVoc[Index].IsKey := TRUE; fVoc[Index].Key1 := aVal; aInFile.Read(fVoc[Index].Key2, 1); end; Inc(Index); end; //-------------------------------------- begin Index := 0; StepCount := 0; Result := 0; KeysCount := 0; for i := 0 to 255 do fVoc[i].IsKey := FALSE; while (Index <= $FF) and (StepCount < $FF) do begin Inc(StepCount); aInFile.Read(aByte, 1); if aByte >= $80 then begin Index := Index + aByte - 127; if (Index <= $FF) then ReadKey; end else for i := 0 to aByte do ReadKey; end; if Index = 256 then aInFile.Read(Result, 2); Result := SEW(Result); for i := 255 downto 0 do if fVoc[i].IsKey then begin Form1.Values2.Strings.Add(IntToHex(i, 2) + '=' + IntToHex(fVoc[i].Key1, 2) + IntToHex(fVoc[i].Key2, 2)); Inc(KeysCount); end; Form1.Values2.Strings.Add('Size=' + IntToHex(Result, 4)); Form1.Values2.Strings.Add('KeysCount=' + IntToStr(KeysCount)); Form1.Values2.Strings.Add(''); end; //============================================================================= procedure TForm1.Button1Click(Sender: TObject); var infile, outfile, cmpfile: TFileStream; var bs, hs, mc, th: integer; var InPart, OutPart: TMemoryStream; inSize: INT64; partSize: Integer; BlockCount, i: Integer; BlockSize, BlockAddr, InfoAddr, dBuf: DWORD; fn: string; SizeList: TList; StartTime: Double; begin if OpenDialog1.Execute then begin StartTime := GetTime; infile := TFileStream.Create(OpenDialog1.FileName, fmOpenRead); outfile := TFileStream.Create(OpenDialog1.FileName +'.cmp', fmCreate); cmpfile := TFileStream.Create(ExtractFilePath(OpenDialog1.FileName) +'05.CMP', fmCreate); InPart := TMemoryStream.Create; OutPart := TMemoryStream.Create; SizeList := TList.Create; infile.Position := 0; BlockCount := 0; inSize := inFile.Size; outfile.Position := 0; bs := 20000; //* maxval */ hs := 16384; //* maxval */ mc := 168; //* default value */ th := 3; //* default min */ SetLength(buffer, bs); SetLength(leftArray, hs); SetLength(rightArray, hs); SetLength(count, hs); dBuf := SED(1); cmpfile.Write(dbuf,4); infoAddr := 24; dBuf := SED(infoAddr); cmpfile.Write(dbuf,4); dBuf := SEW(1); cmpfile.Write(dbuf,2); dBuf := SEW(11); cmpfile.Write(dbuf,2); fn := 'ALLDATA.KWI'; cmpfile.Write(fn[1], 11); cmpfile.Position := InfoAddr; fn := 'BCMP'; cmpfile.Write(fn[1], 4); cmpfile.Position := InfoAddr + 8; dBuf := SED(infile.Size); cmpfile.Write(dbuf,4); while insize > 0 do begin inPart.Position := 0; partSize := $10000; if (inFile.Position + partSize) > infile.Size then partSize := infile.Size - inFile.Position; insize := insize - inPart.CopyFrom(inFile, partSize); inPart.Size := partSize; Inc(BlockCount); inPart.Position := 0; BlockAddr := outfile.size; dBuf := SED(BlockAddr div $800); cmpfile.Write(dbuf,4); OutPart.Clear; BlockSize := compressBlock( inPart, outPart, bs, hs, mc, th ); if BlockSize < $10000 then begin OutPart.Position := 0; outFile.CopyFrom(OutPart, OutPart.Size); dBuf := SEW(BlockSize); end else begin inPart.Position := 0; outFile.CopyFrom(InPart, InPart.Size); dBuf := 0; end; //cmpfile.Write(dbuf,2); //cmpfile.Seek(2, soCurrent); SizeList.Add(POINTER(dBuf)); end; dBuf := SED(BlockCount); cmpfile.Position := InfoAddr + 4; cmpfile.Write(dbuf,4); outfile.Position := 0; cmpfile.Position := InfoAddr + 12 + 4 * SizeList.Count; for i := 0 to SizeList.Count - 1 do begin dBuf := DWORD(SizeList[i]); cmpfile.Write(dBuf, 2); end; // ReadRLE_Vocabulare(outfile); inPart.Free; OutPart.Free; infile.Free; outfile.Free; cmpfile.Free; Form1.Caption := TimeToStr(GetTime - StartTime); end; end; //==================================================================================================================== procedure TForm1.Button2Click(Sender: TObject); var afile: TFileStream; begin afile := TFileStream.Create('E:\Users Data\Олег\Documents\Навигация\U33\Образ\IDX\argsr001.idx', fmOpenRead); ReadRLE_Vocabulare(afile); aFile.Free; end; end.