Post by Russell Marks:
[...]
Post by Tony Nicholson:
https://github.com/agn453/UNZIP-CPM-Z80
[...]
Post by Russell Marks:
I have to say - maybe directing this more at Tony/Martin? - just
looking at the code generally my first thought on optimising for speed
[...]
Here we go again. :-) I thought a constant in the degzip_portable.c
that the deflate code is based on looked familiar, and it is - so I
ported over the table-based CRC code from that. Now this is
"expensive" as the table is 1k long and pushes the COM file slightly
past 5k, but extracting my deflate test zip is 32% quicker with this
(combined with my previous changes) as compared with UNZIP152. It
would be possible to construct the table at runtime of course, but on
a Z80 I imagine a precalculated table might be for the best.
(While I'm posting I may as well note that I ran the version *before*
this table-CRC change against every zip on the Walnut Creek CP/M CD
earlier on, with no CRC errors. Obviously the goal there was to check
the non-deflate code is still working ok, and it seems to be.)
Here's the overall patch against UNZIP152.Z80:
--------------- unzip152-rdbybits-and-crc32tab.diff ---------------
--- UNZIP152.Z80 2020-10-13 16:02:05.906861502 +0100
+++ unzip152-new3.z80 2020-10-15 02:27:13.342645445 +0100
@@ -150,6 +150,19 @@
cp '/'
jp z,usage
;
+; Check TPA size (this will need adjusting if warm-boot-only exit
+; is changed).
+;
+ ld hl,-128 ; allow for a decent stack size
+ add hl,sp
+ ld de,endaddr
+ or a
+ sbc hl,de ; check endaddr is less (i.e. hl is >=)
+ jr nc,wasfil
+ call ilprt
+ db 'Low mem',0 ; just a short error msg to give the idea
+ jp exit
+;
wasfil: ld de,altfcb
ld a,(de) ; output drive given?
ld (opfcb),a ; store it in output file control block
@@ -490,36 +503,62 @@
getcode:
ld a,(codesize)
readbits:
- ld hl,8000h
-bitlp: push af
- push hl
-getbit: ld hl,bleft
- dec (hl)
- jp m,readbt
- dec hl
- rr (hl)
- pop hl
- rr h
- rr l
- jr c,bitex
+ push bc ; may not need to save bc?
+ ld b,a ; B = number of bits still to read (loop counter for djnz)
+ ld c,80h ; bits rotate into C and A
+ xor a ; (rra is 4 cycles vs 8 for others)
+ ld hl,(bitbuf) ; keep bitbuf in L, bleft in H
+getbit: dec h ; one fewer bit left in the current input byte
+ jp p,getbt2 ; skip if new byte not needed yet
+ push af ; getbyte returns its byte in A, so save the partial result
+ push bc ; save bit counter (B) and high half of the result (C)
+ call getbyte ; fetch next input byte into A
+ ld l,a ; new bitbuf
+ ld h,7 ; 8 bits left, pre-dec'd
+ pop bc
pop af
- dec a
- jr nz,bitlp
-finbit: srl h
- rr l
- jr nc,finbit
- jr bitret
-bitex: pop af
-bitret: ld a,l
+getbt2: rr l ; next data bit -> carry
+ rr c ; carry -> top of the 16-bit C:A accumulator
+ rra ; low half follows; carry = bit shifted out of A
+ jr c,bitret ; the 80h marker fell out: 16 bits read, nothing to align
+ djnz getbit ; otherwise loop until B bits have been read
+finbit: srl c ; fewer than 16 bits: shift C:A right with zero fill...
+ rra ; ...until the marker bit drops out into carry
+ jp nc,finbit ; jp likely faster in this case
+bitret: ld (bitbuf),hl ; update bitbuf/bleft
+ ld h,c ; return bits in HL and A
+ ld l,a ; L = A = low byte of the result
+ pop bc ; restore caller's BC
ret
;
-readbt: push hl
+; rdbybits - faster version of readbits for <=8 bits
+; In: A = bit count. Out: A = L = bits read (first bit read is bit 0), H = 0. BC saved/restored.
+rdbybits:
+ push bc ; may not need to save bc?
+ ld b,a ; B = number of bits still to read (loop counter for djnz)
+ ld a,80h ; bits rotate into A (rra faster)
+ ld hl,(bitbuf) ; keep bitbuf in L, bleft in H
+rdbylp: dec h ; one fewer bit left in the current input byte
+ jp p,rdby1 ; skip if new byte not needed yet
+ ld c,a ; stash the partial result in C, as getbyte returns in A
+ push bc ; save bit counter (B) and partial result (C) across the call
call getbyte
- pop hl
- ld (hl),8
- dec hl
- ld (hl),a
- jr getbit
+ ld l,a ; new bitbuf
+ ld h,7 ; 8 bits left, pre-dec'd
+ pop bc
+ ld a,c ; recover the partial result
+rdby1: rr l ; next data bit -> carry
+ rra ; carry -> top of A; carry = bit shifted out of A
+ jr c,rdbyrt ; the 80h marker fell out: 8 bits read, nothing to align
+ djnz rdbylp ; otherwise loop until B bits have been read
+ or a ; clear carry flag initially
+rdby2: rra ; safe as dropped bits are all zeroes
+ jp nc,rdby2 ; jp likely faster in this case
+rdbyrt: ld (bitbuf),hl ; update bitbuf/bleft
+ ld h,0 ; return bits in HL and A
+ ld l,a ; L = A = the bits read
+ pop bc ; restore caller's BC
+ ret
;
scanfn: ld a,(de)
cp '.'
@@ -716,34 +755,37 @@
ld (hl),a
ret
;
-updcrc: ld hl,(crc32)
+; based on this from crc32() in degzip_portable.c:
+; for (i = 0; i < len; i++)
+; crc = crc32_tab[(uint8_t)(crc ^ *b++)] ^ (crc >> 8);
+; In: A = output byte. Updates the 4 bytes at crc32 in place; uses A, BC, DE, HL.
+updcrc: ld bc,(crc32)
+ xor c ; A=low byte of crc xor output byte
+ ld h,0
+ ld l,a ; HL = table index (0..255)
+ add hl,hl ; *2
+ add hl,hl ; *4
+ ld de,crc32tab
+ add hl,de ; HL -> 4-byte entry, stored low byte first
ld de,(crc32 + 2)
+ ; now DEBC is "crc", and HL points to low byte of
+ ; relevant crc32tab entry. Do the xor with "crc"/256,
+ ; starting from the low bytes.
+ ld a,(hl)
+ xor b ; new byte 0 = entry byte 0 xor old crc byte 1
ld c,a
- ld b,8
-crclp: ld a,l
- xor c
- srl c
- srl d
- rr e
- rr h
- rr l
- rra
- jr nc,noxor
- ld a,d
- xor 0edh
- ld d,a
- ld a,e
- xor 0b8h
+ inc hl
+ ld a,(hl)
+ xor e ; new byte 1 = entry byte 1 xor old crc byte 2
+ ld b,a
+ inc hl
+ ld a,(hl)
+ xor d ; new byte 2 = entry byte 2 xor old crc byte 3
ld e,a
- ld a,h
- xor 83h
- ld h,a
- ld a,l
- xor 20h
- ld l,a
-noxor: djnz crclp
- ld (crc32),hl
+ inc hl
+ ld d,(hl) ; high byte is a simple copy
ld (crc32 + 2),de
+ ld (crc32),bc ; store the new low word (bytes 0 and 1)
ret
;
unshrink:
@@ -984,7 +1026,7 @@
lflp: push bc
push hl
ld a,6
- call readbits
+ call rdbybits
pop hl
pop de
ld (hl),a
@@ -999,7 +1041,7 @@
ldfllp: push hl
push bc
ld a,8
- call readbits
+ call rdbybits
pop bc
pop hl
ld (hl),a
@@ -1035,11 +1077,11 @@
or a
jr nz,ur2
ur4: ld a,8
- call readbits
+ call rdbybits
jr ur3
;
ur2: ld a,1
- call readbits
+ call rdbybits
dec l
jr z,ur4
call slenlch
@@ -1073,7 +1115,7 @@
ld a,l
or a
jr z,ur10
- ld (V),a
+ ld (urV),a
ld a,(L_table)
ld h,a
and l
@@ -1106,7 +1148,7 @@
jr nz,ur13
ld a,(D_shift)
ld b,a
- ld a,(V)
+ ld a,(urV)
ur14: srl a
djnz ur14
ld h,a
@@ -1191,7 +1233,7 @@
;
readlengths:
ld a,8
- call readbits
+ call rdbybits
ld d,h
ld e,d
inc hl
@@ -1211,11 +1253,11 @@
push de
push hl
ld a,4
- call readbits
+ call rdbybits
inc a
push af
ld a,4
- call readbits
+ call rdbybits
inc a
ld b,a
pop af
@@ -1412,7 +1454,7 @@
push de
push bc
ld a,1
- call readbits
+ call rdbybits
pop af
push af
or a
@@ -1487,7 +1529,7 @@
jr ui4
;
ui3: ld a,8
- call readbits
+ call rdbybits
ui4: call outb
jr ui1
;
@@ -1512,7 +1554,7 @@
jr nz,ui6
push hl
ld a,8
- call readbits
+ call rdbybits
pop de
add hl,de
ui6: ld de,(mml)
@@ -1529,7 +1571,7 @@
ld (treep),hl
nsloop: push hl
ld a,1
- call readbits
+ call rdbybits
pop hl
or a
jr z,nsleft
@@ -1730,19 +1772,19 @@
;
huffman:
ld a,5
- call readbits
+ call rdbybits
inc a
ld l,a
ld h,1
ld (hlit),hl
ld a,5
- call readbits
+ call rdbybits
inc a
ld (hdist),a
ld a,4
- call readbits
+ call rdbybits
add a,4
ld c,a
@@ -1754,7 +1796,7 @@
push bc
push de
ld a,3
- call readbits
+ call rdbybits
pop hl
ld c,(hl)
ld b,0
@@ -1805,7 +1847,7 @@
cp 010h
jr nz,hmn16
ld a,2
- call readbits
+ call rdbybits
pop hl
pop bc
add a,3
@@ -1823,7 +1865,7 @@
hmn16: cp 011h
jr nz,hmn17
ld a,3
- call readbits
+ call rdbybits
pop hl
pop bc
add a,3
@@ -1839,7 +1881,7 @@
hmn17: cp 012h
jr nz,hmn18
ld a,7
- call readbits
+ call rdbybits
pop hl
pop bc
add a,11
@@ -1965,11 +2007,11 @@
ret nz
ld a,1
- call readbits
+ call rdbybits
push af
ld a,2
- call readbits
+ call rdbybits
or a
jr nz,udnt0
@@ -2125,10 +2167,10 @@
counting:
db 0
init:
- db 0
- db 0
- dw 0,0
- dw -1,-1
+ db 0 ; for bleft
+ db 0 ; for wrtpt
+ dw 0,0 ; for outpos
+ dw -1,-1 ; for crc32
endinit:
inbufp: dw 0080h
readpt: db 80h
@@ -2211,6 +2253,135 @@
db 06dh, 0dbh, 0b6h, 06dh, 0dbh, 0b6h, 0cdh, 0dbh
db 0b6h, 06dh, 0dbh, 0b6h, 06dh, 0dbh, 0a8h, 06dh
db 0ceh, 08bh, 06dh, 03bh
+crc32tab: ; crc32_tab[] from degzip_portable.c, takes 1k
+ db 000h,000h,000h,000h,096h,030h,007h,077h
+ db 02Ch,061h,00Eh,0EEh,0BAh,051h,009h,099h
+ db 019h,0C4h,06Dh,007h,08Fh,0F4h,06Ah,070h
+ db 035h,0A5h,063h,0E9h,0A3h,095h,064h,09Eh
+ db 032h,088h,0DBh,00Eh,0A4h,0B8h,0DCh,079h
+ db 01Eh,0E9h,0D5h,0E0h,088h,0D9h,0D2h,097h
+ db 02Bh,04Ch,0B6h,009h,0BDh,07Ch,0B1h,07Eh
+ db 007h,02Dh,0B8h,0E7h,091h,01Dh,0BFh,090h
+ db 064h,010h,0B7h,01Dh,0F2h,020h,0B0h,06Ah
+ db 048h,071h,0B9h,0F3h,0DEh,041h,0BEh,084h
+ db 07Dh,0D4h,0DAh,01Ah,0EBh,0E4h,0DDh,06Dh
+ db 051h,0B5h,0D4h,0F4h,0C7h,085h,0D3h,083h
+ db 056h,098h,06Ch,013h,0C0h,0A8h,06Bh,064h
+ db 07Ah,0F9h,062h,0FDh,0ECh,0C9h,065h,08Ah
+ db 04Fh,05Ch,001h,014h,0D9h,06Ch,006h,063h
+ db 063h,03Dh,00Fh,0FAh,0F5h,00Dh,008h,08Dh
+ db 0C8h,020h,06Eh,03Bh,05Eh,010h,069h,04Ch
+ db 0E4h,041h,060h,0D5h,072h,071h,067h,0A2h
+ db 0D1h,0E4h,003h,03Ch,047h,0D4h,004h,04Bh
+ db 0FDh,085h,00Dh,0D2h,06Bh,0B5h,00Ah,0A5h
+ db 0FAh,0A8h,0B5h,035h,06Ch,098h,0B2h,042h
+ db 0D6h,0C9h,0BBh,0DBh,040h,0F9h,0BCh,0ACh
+ db 0E3h,06Ch,0D8h,032h,075h,05Ch,0DFh,045h
+ db 0CFh,00Dh,0D6h,0DCh,059h,03Dh,0D1h,0ABh
+ db 0ACh,030h,0D9h,026h,03Ah,000h,0DEh,051h
+ db 080h,051h,0D7h,0C8h,016h,061h,0D0h,0BFh
+ db 0B5h,0F4h,0B4h,021h,023h,0C4h,0B3h,056h
+ db 099h,095h,0BAh,0CFh,00Fh,0A5h,0BDh,0B8h
+ db 09Eh,0B8h,002h,028h,008h,088h,005h,05Fh
+ db 0B2h,0D9h,00Ch,0C6h,024h,0E9h,00Bh,0B1h
+ db 087h,07Ch,06Fh,02Fh,011h,04Ch,068h,058h
+ db 0ABh,01Dh,061h,0C1h,03Dh,02Dh,066h,0B6h
+ db 090h,041h,0DCh,076h,006h,071h,0DBh,001h
+ db 0BCh,020h,0D2h,098h,02Ah,010h,0D5h,0EFh
+ db 089h,085h,0B1h,071h,01Fh,0B5h,0B6h,006h
+ db 0A5h,0E4h,0BFh,09Fh,033h,0D4h,0B8h,0E8h
+ db 0A2h,0C9h,007h,078h,034h,0F9h,000h,00Fh
+ db 08Eh,0A8h,009h,096h,018h,098h,00Eh,0E1h
+ db 0BBh,00Dh,06Ah,07Fh,02Dh,03Dh,06Dh,008h
+ db 097h,06Ch,064h,091h,001h,05Ch,063h,0E6h
+ db 0F4h,051h,06Bh,06Bh,062h,061h,06Ch,01Ch
+ db 0D8h,030h,065h,085h,04Eh,000h,062h,0F2h
+ db 0EDh,095h,006h,06Ch,07Bh,0A5h,001h,01Bh
+ db 0C1h,0F4h,008h,082h,057h,0C4h,00Fh,0F5h
+ db 0C6h,0D9h,0B0h,065h,050h,0E9h,0B7h,012h
+ db 0EAh,0B8h,0BEh,08Bh,07Ch,088h,0B9h,0FCh
+ db 0DFh,01Dh,0DDh,062h,049h,02Dh,0DAh,015h
+ db 0F3h,07Ch,0D3h,08Ch,065h,04Ch,0D4h,0FBh
+ db 058h,061h,0B2h,04Dh,0CEh,051h,0B5h,03Ah
+ db 074h,000h,0BCh,0A3h,0E2h,030h,0BBh,0D4h
+ db 041h,0A5h,0DFh,04Ah,0D7h,095h,0D8h,03Dh
+ db 06Dh,0C4h,0D1h,0A4h,0FBh,0F4h,0D6h,0D3h
+ db 06Ah,0E9h,069h,043h,0FCh,0D9h,06Eh,034h
+ db 046h,088h,067h,0ADh,0D0h,0B8h,060h,0DAh
+ db 073h,02Dh,004h,044h,0E5h,01Dh,003h,033h
+ db 05Fh,04Ch,00Ah,0AAh,0C9h,07Ch,00Dh,0DDh
+ db 03Ch,071h,005h,050h,0AAh,041h,002h,027h
+ db 010h,010h,00Bh,0BEh,086h,020h,00Ch,0C9h
+ db 025h,0B5h,068h,057h,0B3h,085h,06Fh,020h
+ db 009h,0D4h,066h,0B9h,09Fh,0E4h,061h,0CEh
+ db 00Eh,0F9h,0DEh,05Eh,098h,0C9h,0D9h,029h
+ db 022h,098h,0D0h,0B0h,0B4h,0A8h,0D7h,0C7h
+ db 017h,03Dh,0B3h,059h,081h,00Dh,0B4h,02Eh
+ db 03Bh,05Ch,0BDh,0B7h,0ADh,06Ch,0BAh,0C0h
+ db 020h,083h,0B8h,0EDh,0B6h,0B3h,0BFh,09Ah
+ db 00Ch,0E2h,0B6h,003h,09Ah,0D2h,0B1h,074h
+ db 039h,047h,0D5h,0EAh,0AFh,077h,0D2h,09Dh
+ db 015h,026h,0DBh,004h,083h,016h,0DCh,073h
+ db 012h,00Bh,063h,0E3h,084h,03Bh,064h,094h
+ db 03Eh,06Ah,06Dh,00Dh,0A8h,05Ah,06Ah,07Ah
+ db 00Bh,0CFh,00Eh,0E4h,09Dh,0FFh,009h,093h
+ db 027h,0AEh,000h,00Ah,0B1h,09Eh,007h,07Dh
+ db 044h,093h,00Fh,0F0h,0D2h,0A3h,008h,087h
+ db 068h,0F2h,001h,01Eh,0FEh,0C2h,006h,069h
+ db 05Dh,057h,062h,0F7h,0CBh,067h,065h,080h
+ db 071h,036h,06Ch,019h,0E7h,006h,06Bh,06Eh
+ db 076h,01Bh,0D4h,0FEh,0E0h,02Bh,0D3h,089h
+ db 05Ah,07Ah,0DAh,010h,0CCh,04Ah,0DDh,067h
+ db 06Fh,0DFh,0B9h,0F9h,0F9h,0EFh,0BEh,08Eh
+ db 043h,0BEh,0B7h,017h,0D5h,08Eh,0B0h,060h
+ db 0E8h,0A3h,0D6h,0D6h,07Eh,093h,0D1h,0A1h
+ db 0C4h,0C2h,0D8h,038h,052h,0F2h,0DFh,04Fh
+ db 0F1h,067h,0BBh,0D1h,067h,057h,0BCh,0A6h
+ db 0DDh,006h,0B5h,03Fh,04Bh,036h,0B2h,048h
+ db 0DAh,02Bh,00Dh,0D8h,04Ch,01Bh,00Ah,0AFh
+ db 0F6h,04Ah,003h,036h,060h,07Ah,004h,041h
+ db 0C3h,0EFh,060h,0DFh,055h,0DFh,067h,0A8h
+ db 0EFh,08Eh,06Eh,031h,079h,0BEh,069h,046h
+ db 08Ch,0B3h,061h,0CBh,01Ah,083h,066h,0BCh
+ db 0A0h,0D2h,06Fh,025h,036h,0E2h,068h,052h
+ db 095h,077h,00Ch,0CCh,003h,047h,00Bh,0BBh
+ db 0B9h,016h,002h,022h,02Fh,026h,005h,055h
+ db 0BEh,03Bh,0BAh,0C5h,028h,00Bh,0BDh,0B2h
+ db 092h,05Ah,0B4h,02Bh,004h,06Ah,0B3h,05Ch
+ db 0A7h,0FFh,0D7h,0C2h,031h,0CFh,0D0h,0B5h
+ db 08Bh,09Eh,0D9h,02Ch,01Dh,0AEh,0DEh,05Bh
+ db 0B0h,0C2h,064h,09Bh,026h,0F2h,063h,0ECh
+ db 09Ch,0A3h,06Ah,075h,00Ah,093h,06Dh,002h
+ db 0A9h,006h,009h,09Ch,03Fh,036h,00Eh,0EBh
+ db 085h,067h,007h,072h,013h,057h,000h,005h
+ db 082h,04Ah,0BFh,095h,014h,07Ah,0B8h,0E2h
+ db 0AEh,02Bh,0B1h,07Bh,038h,01Bh,0B6h,00Ch
+ db 09Bh,08Eh,0D2h,092h,00Dh,0BEh,0D5h,0E5h
+ db 0B7h,0EFh,0DCh,07Ch,021h,0DFh,0DBh,00Bh
+ db 0D4h,0D2h,0D3h,086h,042h,0E2h,0D4h,0F1h
+ db 0F8h,0B3h,0DDh,068h,06Eh,083h,0DAh,01Fh
+ db 0CDh,016h,0BEh,081h,05Bh,026h,0B9h,0F6h
+ db 0E1h,077h,0B0h,06Fh,077h,047h,0B7h,018h
+ db 0E6h,05Ah,008h,088h,070h,06Ah,00Fh,0FFh
+ db 0CAh,03Bh,006h,066h,05Ch,00Bh,001h,011h
+ db 0FFh,09Eh,065h,08Fh,069h,0AEh,062h,0F8h
+ db 0D3h,0FFh,06Bh,061h,045h,0CFh,06Ch,016h
+ db 078h,0E2h,00Ah,0A0h,0EEh,0D2h,00Dh,0D7h
+ db 054h,083h,004h,04Eh,0C2h,0B3h,003h,039h
+ db 061h,026h,067h,0A7h,0F7h,016h,060h,0D0h
+ db 04Dh,047h,069h,049h,0DBh,077h,06Eh,03Eh
+ db 04Ah,06Ah,0D1h,0AEh,0DCh,05Ah,0D6h,0D9h
+ db 066h,00Bh,0DFh,040h,0F0h,03Bh,0D8h,037h
+ db 053h,0AEh,0BCh,0A9h,0C5h,09Eh,0BBh,0DEh
+ db 07Fh,0CFh,0B2h,047h,0E9h,0FFh,0B5h,030h
+ db 01Ch,0F2h,0BDh,0BDh,08Ah,0C2h,0BAh,0CAh
+ db 030h,093h,0B3h,053h,0A6h,0A3h,0B4h,024h
+ db 005h,036h,0D0h,0BAh,093h,006h,0D7h,0CDh
+ db 029h,057h,0DEh,054h,0BFh,067h,0D9h,023h
+ db 02Eh,07Ah,066h,0B3h,0B8h,04Ah,061h,0C4h
+ db 002h,01Bh,068h,05Dh,094h,02Bh,06Fh,02Ah
+ db 037h,0BEh,00Bh,0B4h,0A1h,08Eh,00Ch,0C3h
+ db 01Bh,0DFh,005h,05Ah,08Dh,0EFh,002h,02Dh
;
; uninitialized storage
;
@@ -2237,6 +2408,7 @@
ds 24
mtchfcb:
ds 11
+; note that as indicated above, bitbuf must be the byte before bleft
bitbuf: ds 1
vars:
bleft: ds 1
@@ -2250,7 +2422,7 @@
ds 1
D_shift:
ds 1
-V: ds 1
+urV: ds 1
nchar: ds 1
lchar: ds 1
ExState:
@@ -2311,5 +2483,5 @@
disttr: ds 4 * nrdist
endtr:
ds 8192 + 2 - (endtr - lenld)
-
+endaddr: ; must be no vars/data beyond this point
end
--------------- unzip152-rdbybits-and-crc32tab.diff ---------------
And I may as well include the C code to generate the table, again
based on degzip_portable.c:
-------------------- gentable.c --------------------
#include <stdio.h>
/*
 * Emit the 256-entry CRC-32 lookup table (reflected polynomial
 * 0xedb88320) as Z80 assembler "db" lines on stdout.
 *
 * Each 32-bit entry is written as four bytes, low byte first, so the
 * assembly code can index the table with (index * 4) and read the
 * bytes in ascending address order.  Two entries (8 bytes) go on each
 * output line.  Every byte is printed as three hex digits plus an "h"
 * suffix, which guarantees a leading zero digit for the assembler.
 */
int main(void)
{
    unsigned long c, i, j;

    for (i = 0; i < 256; i++)
    {
        /* start a new "db" line for every even-numbered entry */
        if ((i & 1) == 0)
            printf("\tdb\t");

        /* run the table index through 8 rounds of the bitwise CRC */
        c = i;
        for (j = 0; j < 8; j++)
            c = (c >> 1) ^ ((c & 1) ? 0xedb88320UL : 0UL);

        /*
         * %X expects an unsigned int; c is unsigned long, so each
         * masked byte is cast explicitly rather than relying on the
         * two types having the same size.
         */
        printf("%03Xh,%03Xh,%03Xh,%03Xh",
               (unsigned)(c & 255),
               (unsigned)((c >> 8) & 255),
               (unsigned)((c >> 16) & 255),
               (unsigned)((c >> 24) & 255));

        /* comma between the two entries on a line, newline after the second */
        putchar(((i & 1) == 1) ? '\n' : ',');
    }

    return 0;
}
-------------------- gentable.c --------------------
-Rus.