Compression and decompression
Oct 27, 2015 21:15:15 GMT 1
Post by Pjot on Oct 27, 2015 21:15:15 GMT 1
It would be fun to have native compression in BaCon using an original algorithm. I tried to implement a simple compression technique for text files:
(1) find all words in the file and count their occurrence, put this information in a list
(2) sort the list, and identify the most occurring words with a unique number, store this in a table
(3) write this table, then the data of the original file, replacing the words from the table with their number.
For decompression it is the other way around of course.
I made a Proof of Concept (POC) to implement this idea, just to see how well it compresses. Typically, compression ratio for BaCon source code programs lies between 65% and 75%. Not bad for a first attempt, but definitely not good enough
Below the compression and decompression programs. They don't deserve a price for the nicest code, but it's just a POC like I mentioned.
BR
Peter
COMPRESSION
DECOMPRESS
(1) find all words in the file and count their occurrence, put this information in a list
(2) sort the list, and identify the most occurring words with a unique number, store this in a table
(3) write this table, then the data of the original file, replacing the words from the table with their number.
For decompression it is the other way around of course.
I made a Proof of Concept (POC) to implement this idea, just to see how well it compresses. Typically, compression ratio for BaCon source code programs lies between 65% and 75%. Not bad for a first attempt, but definitely not good enough
Below the compression and decompression programs. They don't deserve a price for the nicest code, but it's just a POC like I mentioned.
BR
Peter
COMPRESSION
'
' Proof-of-Concept compression for plain text files ASCII 0-127
'
' (1) Find unique words a-zA-Z and count their frequency
' (2) Replace their occurrence with a number>127 (bit7 set)
'
' Created by PvE 2015 - GPL v3.
'---------------------------------------------------------------------
OPTION MEMSTREAM TRUE
OPTION MEMTYPE unsigned char
IF INSTR(ARGUMENT$, " ") THEN
file$ = MID$(ARGUMENT$, INSTR(ARGUMENT$, " ")+1)
IF NOT(FILEEXISTS(file$)) THEN
PRINT "Cannot find file '", file$, "'! Exiting..."
END 1
FI
ELSE
PRINT "Usage: ", ARGUMENT$, " <file.txt>"
END 1
FI
CONST WordListSize = 10000
DECLARE word ASSOC int
tt = TIMER
FileSize = FILELEN(file$)
FileData = MEMORY(FileSize)
PRINT "Opening file..."
' Read file into memory area
OPEN file$ FOR READING AS myfile
GETBYTE FileData FROM myfile SIZE FileSize
CLOSE FILE myfile
' DYnamic array to store all words
amount_of_words = WordListSize
DECLARE word$ ARRAY amount_of_words
OPEN FileData FOR MEMORY AS dat$
PRINT "Analyzing data..."
length = 0
ctr = 0
FOR x = 0 TO FileSize-1
IF (PEEK(FileData+x) & 128) THEN
PRINT "Binary data found! Is this a UTF-8 file? Exiting..."
END 1
ENDIF
IF PEEK(FileData+x) > 64 AND PEEK(FileData+x) < 91 THEN
INCR length
ELIF PEEK(FileData+x) > 96 AND PEEK(FileData+x) < 123 THEN
INCR length
ELSE
IF length > 1 THEN
word$[ctr] = MID$(dat$, x+1-length, length)
INCR ctr
IF ctr >= amount_of_words THEN
amount_of_words = 2*amount_of_words
REDIM word$ TO amount_of_words
PRINT "Resizing word list..."
FI
ENDIF
length = 0
FI
NEXT
' This assoc array contains the unique words
DECLARE uniq_word ASSOC long
FOR x = 0 TO ctr-1
INCR uniq_word(word$[x])
NEXT
' We multiply the frequency with the length
LOOKUP uniq_word TO t$ SIZE amount
FOR x = 0 TO amount-1
IF LEN(t$[x]) > 1 THEN uniq_word(t$[x]) = uniq_word(t$[x]) * LEN(t$[x])
NEXT
' Sort and create final array
SORT uniq_word DOWN
LOOKUP uniq_word TO s$ SIZE amount
'-------------------------------------------------------------
PRINT "Amount of words: ", ctr
PRINT "Amount of unique words: ", amount
'PRINT "-----------------------------------"
'PRINT "Words most used: "
'FOR x = 0 TO 127
' PRINT x, " : ", s$[x], " : ", uniq_word(s$[x])
'NEXT
'PRINT "-----------------------------------"
'-------------------------------------------------------------
IdxCtr = MEMORY(2)
POKE IdxCtr, 128
' Correct index for array
DECR amount
' Not more than 127 indexes
IF amount > 127 THEN amount = 127
limit = uniq_word(s$[amount])
DECLARE idx_word ASSOC int
newFile$ = MID$(file$, 1, INSTRREV(file$, ".")) & "baz"
OPEN newFile$ FOR WRITING AS output
PRINT "Writing index table..."
' First store original filename
PUTBYTE file$ TO output SIZE LEN(file$)
' Write table with index number and word
FOR x = 0 TO amount
PUTBYTE IdxCtr TO output SIZE 1
PUTBYTE s$[x] TO output SIZE LEN(s$[x])
idx_word(s$[x]) = PEEK(IdxCtr)
POKE IdxCtr, PEEK(IdxCtr)+1
NEXT
' Marker to indicate data starts here
POKE IdxCtr, 255
PUTBYTE IdxCtr TO output SIZE 1
PUTBYTE IdxCtr TO output SIZE 1
PRINT "Writing data..."
length = 0
FOR x = 0 TO FileSize-1
IF PEEK(FileData+x) > 64 AND PEEK(FileData+x) < 91 THEN
INCR length
ELIF PEEK(FileData+x) > 96 AND PEEK(FileData+x) < 123 THEN
INCR length
ELSE
' If the word is longer than 1 character
IF length > 1 THEN
cur$ = MID$(dat$, x+1-length, length)
IF uniq_word(cur$) > limit THEN
POKE IdxCtr, idx_word(cur$)
PUTBYTE IdxCtr TO output SIZE 1
ELSE
PUTBYTE cur$ TO output SIZE LEN(cur$)
FI
POKE IdxCtr, PEEK(FileData+x)
PUTBYTE IdxCtr TO output SIZE 1
' Original bytes are written unmodified
ELSE
IF length = 1 THEN
POKE IdxCtr, PEEK(FileData+x-1)
POKE IdxCtr+1, PEEK(FileData+x)
ELSE
POKE IdxCtr, PEEK(FileData+x)
FI
PUTBYTE IdxCtr TO output SIZE length+1
ENDIF
length = 0
FI
NEXT
CLOSE FILE output
CLOSE MEMORY dat$
FREE IdxCtr, FileData
PRINT "All done. Time taken (msecs): ", TIMER-tt
PRINT "Original file size: ", FileSize
PRINT "Compressed file size: ", FILELEN(newFile$)
PRINT "Compression ratio: ", FILELEN(newFile$)*100/FileSize, "%."
END
DECOMPRESS
'
' Proof-of-Concept decompression for plain text files ASCII 0-127
'
' (1) Recreate table from beginning
' (2) Replace bytes with a value > 127 by the corresponding term in the table
'
' Created by PvE 2015 - GPL v3.
'---------------------------------------------------------------------
OPTION MEMSTREAM TRUE
OPTION MEMTYPE unsigned char
IF INSTR(ARGUMENT$, " ") THEN
file$ = MID$(ARGUMENT$, INSTR(ARGUMENT$, " ")+1)
IF NOT(FILEEXISTS(file$)) THEN
PRINT "Cannot find file '", file$, "'! Exiting..."
END 1
FI
ELSE
PRINT "Usage: ", ARGUMENT$, " <file.txt>"
END 1
FI
tt = TIMER
FileSize = FILELEN(file$)
FileData = MEMORY(FileSize)
PRINT "Opening file..."
OPEN file$ FOR READING AS myfile
GETBYTE FileData FROM myfile SIZE FileSize
CLOSE FILE myfile
DECLARE word$[256]
PRINT "Constructing index table..."
' Get filename
x = 0
WHILE TRUE
IF PEEK(FileData+x) < 128 THEN
out$ = out$ & CHR$(PEEK(FileData+x))
ELSE
BREAK
ENDIF
INCR x
WEND
IF FILEEXISTS(out$) THEN INPUT "Filename '", out$, "' exists! Overwrite (y/n)? ", answer$
IF answer$ <> "y" THEN END
OPEN out$ FOR WRITING AS uncompressed
' Build table
REPEAT
p = PEEK(FileData+x)
' Reaching end marker
IF p = 255 AND PEEK(FileData+x+1) = 255 THEN BREAK
' Index number?
IF p >= 128 THEN
WHILE TRUE
INCR x
IF PEEK(FileData+x) < 128 THEN
word$[p] = word$[p] & CHR$(PEEK(FileData+x))
ELSE
BREAK
ENDIF
WEND
ELSE
INCR x
ENDIF
UNTIL x >= FileSize-1
INCR x, 2
IdxCtr = MEMORY(1)
PRINT "Writing uncompressed data..."
' Write file
REPEAT
IF PEEK(FileData+x) < 128 THEN
POKE IdxCtr, PEEK(FileData+x)
PUTBYTE IdxCtr TO uncompressed SIZE 1
ELSE
txt$ = word$[PEEK(FileData+x)]
PUTBYTE txt$ TO uncompressed SIZE LEN(txt$)
FI
INCR x
UNTIL x >= FileSize
FREE IdxCtr, FileData
CLOSE FILE uncompressed
PRINT "All done. Time taken (msecs): ", TIMER-tt
END