retromining on z80

Anyone interested in retromining?

Several years ago I implemented SHA-256 on my TI-83. I also implemented AES-128-CFB. I gave up on RSA and DSA because of memory constraints, so I never did create a complete strong-crypto suite for TI-83/z80. With all the interest in bitcoin these days, maybe someone will find my SHA-256 code interesting (or I'll revisit it---although I'd personally find asymmetric crypto or a stronger symmetric block mode more interesting).

If the ~800 bytes/sec figure from my code comments is reliable, the old z80 should manage roughly 7 hashes/sec, right?

Calculation code:

Code:

; sha256up.z80 by timewave0
; 1twzU46whuMER6hhBPCGmdaLw8atyv9c4
;
; this file contains pure, portable z80 code,
;  no TI-8x ROM calls

; sha256update -- run one SHA-256 compression over the block in WBUF
;
;  in:   WBUF+0..63  one complete 64-byte message block
;        H0..H7      current hash state
;  out:  H0..H7      updated hash state
;        WBUF        overwritten with the message schedule W(0..63)
;  clobbers a, bc, de, hl, ix and flags, plus the scratch variables
;   sigma_temp, TEMP1, TEMP2, ABUF..HBUF and step3_t
;
; all 32-bit words are stored most significant byte first
sha256update:

; step 1: extend W(0..15) to W(16..63) using
;  W(t) = sigma_1(W(t-2)) + W(t-7) + sigma_0(W(t-15)) + W(t-16)
;
; sigma_temp is used as 4 groups of 3 bytes: group n collects
;  byte n (0 = MSB) of the three terms of a sigma function, so
;  every result byte is the xor of 3 consecutive bytes
	ld ix,WBUF+64-4		; W(15), bumped to W(16) below
	ld c,16-1		; c = t-1
step1:	inc ix
	inc ix			; ix tracks W(t)
	inc ix
	inc ix
	inc c			; c = t

; lowercase sigma_1 of W(t-2) = ROTR17 xor ROTR19 xor SHR10
	ld h,(ix-8+2)		; rotate 16 bits by
	ld l,(ix-8+3)		;  loading the bytes
	ld d,(ix-8+0)		;  as 2301 into hlde
	ld e,(ix-8+1)
	ld b,1
	call rotrb		; 16+1 = ROTR17
	ld a,h
	ld (sigma_temp+0+0),a	; term 0 of each group
	ld a,l
	ld (sigma_temp+3+0),a
	ld a,d
	ld (sigma_temp+6+0),a
	ld a,e
	ld (sigma_temp+9+0),a

	ld b,2
	call rotrb		; 17+2 = ROTR19
	ld a,h
	ld (sigma_temp+0+1),a	; term 1 of each group
	ld a,l
	ld (sigma_temp+3+1),a
	ld a,d
	ld (sigma_temp+6+1),a
	ld a,e
	ld (sigma_temp+9+1),a

	ld h,0			; shift 8 bits by
	ld l,(ix-8+0)		;  loading the bytes
	ld d,(ix-8+1)		;  as 0,0,1,2
	ld e,(ix-8+2)
	ld b,2
	call sharb		; 8+2 = SHR10
	ld a,h
	ld (sigma_temp+0+2),a	; term 2 of each group
	ld a,l
	ld (sigma_temp+3+2),a
	ld a,d
	ld (sigma_temp+6+2),a
	ld a,e
	ld (sigma_temp+9+2),a

	ld hl,sigma_temp	; W(t) = xor of the 3 terms
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)
	ld (ix+0),a

	inc hl
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)
	ld (ix+1),a

	inc hl
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)
	ld (ix+2),a

	inc hl
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)
	ld (ix+3),a
; done with lowercase sigma_1

; lowercase sigma_0 of W(t-15) = ROTR7 xor ROTR18 xor SHR3
	ld h,(ix-60+3)		; rotate 8 bits by
	ld l,(ix-60+0)		;  loading the bytes
	ld d,(ix-60+1)		;  as 3012 into hlde
	ld e,(ix-60+2)
	ld b,1
	call slcb		; 8-1 = ROTR7
	ld a,h
	ld (sigma_temp+0+0),a	; term 0 of each group
	ld a,l
	ld (sigma_temp+3+0),a
	ld a,d
	ld (sigma_temp+6+0),a
	ld a,e
	ld (sigma_temp+9+0),a

	ld a,e
	ld e,d
	ld d,l
	ld l,h
	ld h,a			; hlde -> ehld, 8 more bits
	ld b,3
	call rotrb		; 7+8+3 = ROTR18
	ld a,h
	ld (sigma_temp+0+1),a	; term 1 of each group
	ld a,l
	ld (sigma_temp+3+1),a
	ld a,d
	ld (sigma_temp+6+1),a
	ld a,e
	ld (sigma_temp+9+1),a

	ld h,(ix-60+0)		; reload W(t-15) unrotated
	ld l,(ix-60+1)
	ld d,(ix-60+2)
	ld e,(ix-60+3)
	ld b,3
	call sharb		; SHR3
	ld a,h
	ld (sigma_temp+0+2),a	; term 2 of each group
	ld a,l
	ld (sigma_temp+3+2),a
	ld a,d
	ld (sigma_temp+6+2),a
	ld a,e
	ld (sigma_temp+9+2),a

	ld hl,sigma_temp	; sigma_0 -> d,e,b,a (MSB first)
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)
	ld d,a

	inc hl
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)
	ld e,a

	inc hl
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)
	ld b,a

	inc hl
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)

	add a,(ix+3)		; W(t) += sigma_0, LSB first;
	ld (ix+3),a		;  the ld's between the adds
	ld a,(ix+2)		;  leave the carry alone
	adc a,b
	ld (ix+2),a
	ld a,(ix+1)
	adc a,e
	ld (ix+1),a
	ld a,(ix+0)
	adc a,d
	ld (ix+0),a

	push ix			; keep W(t) for the next pass

	push ix
	pop hl
	inc hl
	inc hl
	inc hl			; hl = LSB of W(t)
	ld de,3-28
	add ix,de		; ix = LSB of W(t-7)
	call add32		; += W_t-7

	inc hl			; add32 left hl on the MSB
	inc hl
	inc hl
	ld de,28-64
	add ix,de		; ix = LSB of W(t-16)
	call add32		; += W_t-16

	pop ix

; W(63) is the last word of the schedule.  Stop once t reaches
;  63: one more pass would build a "W(64)" at WBUF+256, which
;  is sigma_temp rather than part of WBUF.
	ld a,c
	cp 63			; carry while t < 63
	jp c,step1
; done with message schedule

; step 2: load the working variables a..h from the hash state.
;  H0..H7 and ABUF..HBUF are each 8 consecutive 4-byte words,
;  so a single block copy is all it takes
	ld bc,8*4
	ld de,ABUF
	ld hl,H0
	ldir
; a..h now hold the current hash

; step 3: the 64 rounds of the compression function.  Each round
;  builds T1 in TEMP1 and T2 in TEMP2, shifts the working
;  variables a..h along by one word, then finishes e and a:
;   T1 = h + Sigma_1(e) + Ch(e,f,g) + K(t) + W(t)
;   T2 = Sigma_0(a) + Maj(a,b,c)
;   h=g, g=f, f=e, e=d+T1, d=c, c=b, b=a, a=T1+T2
; sigma_temp is used as 4 groups of 3 bytes: group n collects
;  byte n (0 = MSB) of the three rotations, so every result
;  byte is the xor of 3 consecutive bytes
	xor a			; start with t = 0
	ld (step3_t),a		; counter is step3_t
step_3:

; uppercase sigma_1 of e = ROTR6 xor ROTR11 xor ROTR25
	ld ix,EBUF

	ld h,(ix+3)		; rotate 8 bits right by
	ld l,(ix+0)		;  loading the bytes as
	ld d,(ix+1)		;  3012 into hlde
	ld e,(ix+2)
	ld b,2
	call slcb		; 8-2 = net 6 bits right
	ld a,h
	ld (sigma_temp+0+0),a	; rotation 0 of each group
	ld a,l
	ld (sigma_temp+3+0),a
	ld a,d
	ld (sigma_temp+6+0),a
	ld a,e
	ld (sigma_temp+9+0),a

	ld a,e			; hlde -> ehld,
	ld e,d			;  8 more bits right
	ld d,l
	ld l,h
	ld h,a
	ld b,3
	call slcb		; 6+8-3 = 11 bits right
	ld a,h
	ld (sigma_temp+0+1),a	; rotation 1 of each group
	ld a,l
	ld (sigma_temp+3+1),a
	ld a,d
	ld (sigma_temp+6+1),a
	ld a,e
	ld (sigma_temp+9+1),a

	ex de,hl		; hlde -> dehl, 16 more bits
	ld b,2
	call slcb		; 11+16-2 = 25 bits right
	ld a,h
	ld (sigma_temp+0+2),a	; rotation 2 of each group
	ld a,l
	ld (sigma_temp+3+2),a
	ld a,d
	ld (sigma_temp+6+2),a
	ld a,e
	ld (sigma_temp+9+2),a

	ld hl,sigma_temp	; TEMP1 = xor of the 3 rotations
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)
	ld (TEMP1+0),a

	inc hl
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)
	ld (TEMP1+1),a

	inc hl
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)
	ld (TEMP1+2),a

	inc hl
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)
	ld (TEMP1+3),a
; done with uppercase sigma_1 of e

	ld ix,HBUF+3
	ld hl,TEMP1+3
	call add32		; T_1 += HBUF

	ld ix,WBUF+3
	ld a,(step3_t)
	ld b,0
	sla a			; bc = 4*t: the bits shifted
	rl b			; *= 2   out of a are caught in b
	sla a
	rl b			; *= 2
	ld c,a
	add ix,bc		; ix = LSB of W(t)
	ld hl,TEMP1+3
	call add32		; T_1 += W_t

	ld ix,Karray+3		; add32 does not touch bc,
	add ix,bc		;  so it still holds 4*t
	ld hl,TEMP1+3
	call add32		; T_1 += K_t

; use g xor (e and (f xor g)) for Ch
;  the result goes to d,e,b,a (MSB first)
	ld ix,EBUF

	ld a,(ix+8+0)		; GBUF
	xor (ix+4+0)		; xor FBUF
	and (ix+0+0)		; and EBUF
	xor (ix+8+0)		; xor GBUF
	ld d,a
	ld a,(ix+8+1)		; GBUF
	xor (ix+4+1)		; xor FBUF
	and (ix+0+1)		; and EBUF
	xor (ix+8+1)		; xor GBUF
	ld e,a
	ld a,(ix+8+2)		; GBUF
	xor (ix+4+2)		; xor FBUF
	and (ix+0+2)		; and EBUF
	xor (ix+8+2)		; xor GBUF
	ld b,a
	ld a,(ix+8+3)		; GBUF
	xor (ix+4+3)		; xor FBUF
	and (ix+0+3)		; and EBUF
	xor (ix+8+3)		; xor GBUF

; EBUF-20 is TEMP1 (5 words below EBUF in the RAM layout)
	add a,(ix-20+3)
	ld (ix-20+3),a
	ld a,(ix-20+2)
	adc a,b
	ld (ix-20+2),a
	ld a,(ix-20+1)
	adc a,e
	ld (ix-20+1),a
	ld a,(ix-20+0)
	adc a,d
	ld (ix-20+0),a		; TEMP1 += Ch

; uppercase sigma_0 of a = ROTR2 xor ROTR13 xor ROTR22
	ld ix,ABUF

	ld h,(ix+0)
	ld l,(ix+1)
	ld d,(ix+2)		;  0123 -> hlde
	ld e,(ix+3)
	ld b,2
	call rotrb		; two bits right
	ld a,h
	ld (sigma_temp+0+0),a	; rotation 0 of each group
	ld a,l
	ld (sigma_temp+3+0),a
	ld a,d
	ld (sigma_temp+6+0),a
	ld a,e
	ld (sigma_temp+9+0),a

	ld a,e			; hlde -> ehld,
	ld e,d			;  8 more bits right
	ld d,l
	ld l,h
	ld h,a
	ld b,3
	call rotrb		; 2+8+3 = 13 bits right
	ld a,h
	ld (sigma_temp+0+1),a	; rotation 1 of each group
	ld a,l
	ld (sigma_temp+3+1),a
	ld a,d
	ld (sigma_temp+6+1),a
	ld a,e
	ld (sigma_temp+9+1),a

	ld a,e			; hlde -> ehld,
	ld e,d			;  8 more bits right
	ld d,l
	ld l,h
	ld h,a
	ld b,1
	call rotrb		; 2+8+3+8+1 = 22 bits right
	ld a,h
	ld (sigma_temp+0+2),a	; rotation 2 of each group
	ld a,l
	ld (sigma_temp+3+2),a
	ld a,d
	ld (sigma_temp+6+2),a
	ld a,e
	ld (sigma_temp+9+2),a

	ld hl,sigma_temp	; TEMP2 = xor of the 3 rotations
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)
	ld (TEMP2+0),a

	inc hl
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)
	ld (TEMP2+1),a

	inc hl
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)
	ld (TEMP2+2),a

	inc hl
	ld a,(hl)
	inc hl
	xor (hl)
	inc hl
	xor (hl)
	ld (TEMP2+3),a
; done with uppercase sigma_0 of a

; use (a and b) or (c and (a or b)) for Maj
;  the result goes to d,e,b,a (MSB first); c is a temporary
	; ix still points to ABUF

	ld a,(ix+0+0)		; ABUF
	or (ix+4+0)		; or BBUF
	and (ix+8+0)		; and CBUF
	ld d,a
	ld a,(ix+0+0)		; ABUF
	and (ix+4+0)		; and BBUF
	or d
	ld d,a
	ld a,(ix+0+1)		; ABUF
	or (ix+4+1)		; or BBUF
	and (ix+8+1)		; and CBUF
	ld e,a
	ld a,(ix+0+1)		; ABUF
	and (ix+4+1)		; and BBUF
	or e
	ld e,a
	ld a,(ix+0+2)		; ABUF
	or (ix+4+2)		; or BBUF
	and (ix+8+2)		; and CBUF
	ld b,a
	ld a,(ix+0+2)		; ABUF
	and (ix+4+2)		; and BBUF
	or b
	ld b,a
	ld a,(ix+0+3)		; ABUF
	or (ix+4+3)		; or BBUF
	and (ix+8+3)		; and CBUF
	ld c,a
	ld a,(ix+0+3)		; ABUF
	and (ix+4+3)		; and BBUF
	or c

; ABUF+32 is TEMP2 (8 words above ABUF in the RAM layout)
	add a,(ix+32+3)
	ld (ix+32+3),a
	ld a,(ix+32+2)
	adc a,b
	ld (ix+32+2),a
	ld a,(ix+32+1)
	adc a,e
	ld (ix+32+1),a
	ld a,(ix+32+0)
	adc a,d
	ld (ix+32+0),a		; TEMP2 += Maj

; shift TEMP1..GBUF up one word onto ABUF..HBUF.  The ranges
;  overlap, so the copy runs from the top byte down (lddr).
;  This is why TEMP1 must sit directly below ABUF.
	ld de,HBUF+3		; H=G, G=F, F=E,
	ld hl,GBUF+3		; E=D, D=C, C=B,
	ld bc,32		; B=A, A=T1
	lddr

	ld hl,EBUF+3		; EBUF holds the old d here
	ld ix,TEMP1+3
	call add32		; E += T1

	ld hl,ABUF+3		; ABUF holds T1 here
	ld ix,TEMP2+3
	call add32		; A += T2

	ld a,(step3_t)
	inc a
	ld (step3_t),a
	cp 64			; loop 64 times
	jp c,step_3

; step 4: fold the working variables back into the hash state,
;  H(i) += [a..h](i), working from h down to a.  add32 leaves
;  hl on the MSB of its destination, so each destination is
;  simply loaded afresh
	ld hl,H7+3
	ld ix,HBUF+3
	call add32		; H7 += h

	ld hl,H6+3
	ld ix,GBUF+3
	call add32		; H6 += g

	ld hl,H5+3
	ld ix,FBUF+3
	call add32		; H5 += f

	ld hl,H4+3
	ld ix,EBUF+3
	call add32		; H4 += e

	ld hl,H3+3
	ld ix,DBUF+3
	call add32		; H3 += d

	ld hl,H2+3
	ld ix,CBUF+3
	call add32		; H2 += c

	ld hl,H1+3
	ld ix,BBUF+3
	call add32		; H1 += b

	ld hl,H0+3
	ld ix,ABUF+3
	call add32		; H0 += a
; the hash state is up to date

	ret
; end of sha256update subroutine


; rotrb -- rotate the 32-bit value in hlde right by b bits
;  in:  hlde = value (h is the most significant byte)
;       b    = bit count; 0 behaves as 256, since djnz
;              decrements before it tests
;  out: hlde = rotated value, b = 0
;  a, c and ix are left alone
rotrb:	srl h			; 0 -> h -> l -> d -> e -> carry
	rr l
	rr d
	rr e
	jr c,rotrb_wrap		; a 1 fell off the bottom
	djnz rotrb
	ret
rotrb_wrap:
	set 7,h			; feed it back in at bit 31
	djnz rotrb
	ret
; the result is in hlde


; sharb -- shift the 32-bit value in hlde right by b bits,
;  filling with zeros from the top
;  in:  hlde = value (h is the most significant byte)
;       b    = bit count; 0 behaves as 256, since djnz
;              decrements before it tests
;  out: hlde = shifted value, b = 0
;  a, c and ix are left alone
sharb:	or a			; carry = 0, a keeps its value
	rr h			; 0 -> h -> l -> d -> e
	rr l
	rr d
	rr e
	djnz sharb
	ret
; the result is in hlde


; slcb -- rotate the 32-bit value in hlde left by b bits
;  in:  hlde = value (h is the most significant byte)
;       b    = bit count; 0 behaves as 256, since djnz
;              decrements before it tests
;  out: hlde = rotated value, b = 0
;  a, c and ix are left alone
slcb:	or a			; carry = 0, a keeps its value
	rl e			; carry <- h <- l <- d <- e <- 0
	rl d
	rl l
	rl h
	jr nc,slcb_next
	inc e			; bit 0 is clear: wrap bit 31 into it
slcb_next:
	djnz slcb
	ret
; the result is in hlde


; add32 -- 32-bit addition in memory: (hl) += (ix)
;  in:  hl = address of the least significant (last) byte
;            of the destination word
;       ix = address of the least significant (last) byte
;            of the source word
;  out: destination word updated, carry = carry out of the MSB
;       hl = address of the destination's MSB (hl - 3)
;  ix, bc and de are preserved; a is clobbered
add32:	ld a,(ix+0)		; LSB
	add a,(hl)
	ld (hl),a
	dec hl			; 16-bit dec: carry survives
	ld a,(ix-1)
	adc a,(hl)		; add with carry
	ld (hl),a
	dec hl
	ld a,(ix-2)
	adc a,(hl)
	ld (hl),a
	dec hl
	ld a,(ix-3)		; MSB
	adc a,(hl)
	ld (hl),a
	ret
; hl changes, and a is clobbered


; 3-byte variable name that the interface code copies into OP1
;  before calling _findsym/_chkfindsym.  The first byte, $04,
;  equals strngobj; $AA,$00 are presumably the TI token pair
;  for Str1 -- TODO confirm against the token table
Str1: .db $04,$AA,$00

; initial values
;  the initial hash H(0) of SHA-256 (FIPS PUB 180-2): 8 words
;  of 4 bytes, most significant byte first, 32 bytes in all.
;  The interface code block-copies them to H0..H7.
H0init: .db $6a, $09, $e6, $67
H1init: .db $bb, $67, $ae, $85
H2init: .db $3c, $6e, $f3, $72
H3init: .db $a5, $4f, $f5, $3a
H4init: .db $51, $0e, $52, $7f
H5init: .db $9b, $05, $68, $8c
H6init: .db $1f, $83, $d9, $ab
H7init: .db $5b, $e0, $cd, $19

; constants
;  the round constants K(0)..K(63) of SHA-256 (FIPS PUB 180-2),
;  4 bytes each, most significant byte first, two per line.
;  Round t reads K(t) through ix = Karray+3+4*t, the address
;  of its least significant byte.
Karray:	.db $42,$8a,$2f,$98,$71,$37,$44,$91,
	.db $b5,$c0,$fb,$cf,$e9,$b5,$db,$a5,
	.db $39,$56,$c2,$5b,$59,$f1,$11,$f1,
	.db $92,$3f,$82,$a4,$ab,$1c,$5e,$d5,
	.db $d8,$07,$aa,$98,$12,$83,$5b,$01,
	.db $24,$31,$85,$be,$55,$0c,$7d,$c3,
	.db $72,$be,$5d,$74,$80,$de,$b1,$fe,
	.db $9b,$dc,$06,$a7,$c1,$9b,$f1,$74,
	.db $e4,$9b,$69,$c1,$ef,$be,$47,$86,
	.db $0f,$c1,$9d,$c6,$24,$0c,$a1,$cc,
	.db $2d,$e9,$2c,$6f,$4a,$74,$84,$aa,
	.db $5c,$b0,$a9,$dc,$76,$f9,$88,$da,
	.db $98,$3e,$51,$52,$a8,$31,$c6,$6d,
	.db $b0,$03,$27,$c8,$bf,$59,$7f,$c7,
	.db $c6,$e0,$0b,$f3,$d5,$a7,$91,$47,
	.db $06,$ca,$63,$51,$14,$29,$29,$67,
	.db $27,$b7,$0a,$85,$2e,$1b,$21,$38,
	.db $4d,$2c,$6d,$fc,$53,$38,$0d,$13,
	.db $65,$0a,$73,$54,$76,$6a,$0a,$bb,
	.db $81,$c2,$c9,$2e,$92,$72,$2c,$85,
	.db $a2,$bf,$e8,$a1,$a8,$1a,$66,$4b,
	.db $c2,$4b,$8b,$70,$c7,$6c,$51,$a3,
	.db $d1,$92,$e8,$19,$d6,$99,$06,$24,
	.db $f4,$0e,$35,$85,$10,$6a,$a0,$70,
	.db $19,$a4,$c1,$16,$1e,$37,$6c,$08,
	.db $27,$48,$77,$4c,$34,$b0,$bc,$b5,
	.db $39,$1c,$0c,$b3,$4e,$d8,$aa,$4a,
	.db $5b,$9c,$ca,$4f,$68,$2e,$6f,$f3,
	.db $74,$8f,$82,$ee,$78,$a5,$63,$6f,
	.db $84,$c8,$78,$14,$8c,$c7,$02,$08,
	.db $90,$be,$ff,$fa,$a4,$50,$6c,$eb,
	.db $be,$f9,$a3,$f7,$c6,$71,$78,$f2

Interface code:

Code:

; sha256.z80 by timewave0
; 1twzU46whuMER6hhBPCGmdaLw8atyv9c4
;
;  see FIPS PUB 180-2
;
; ~800 bytes/sec for very large
;  programs, as measured on Vti
;
; verified for random byte patterns of lengths:
;  0-10, 50-70, 255-257, 16383-16385, 19000

.LIST
; system entry points called by the interface code
;  (addresses are the author's; presumably TI-83 ROM calls)
_chkfindsym	.equ $442A
_findsym	.equ $442E
_zeroop1	.equ $428E
_errundefined	.equ $467B
_errsyntax	.equ $466C
_createstrng	.equ $4472
_delvar		.equ $44AA

; buffer that holds the variable name for the lookup calls
OP1		.equ $8039

; object type codes: progobj is stored in OP1 for the program
;  lookup, strngobj is compared with the type of Str1
progobj		.equ $05
strngobj	.equ $04

; RAM layout.  Everything from block through size is
;  consecutive, and is zeroed by clearvars.
block		.equ $8265	; magic number
				; whole blocks left (2 bytes)
extra_bytes	.equ block+2	; size mod 64 (1 byte)
H0		.equ extra_bytes+1	; hash state: 8 words
H1		.equ H0+4		;  of 4 bytes each
H2		.equ H1+4
H3		.equ H2+4
H4		.equ H3+4
H5		.equ H4+4
H6		.equ H5+4
H7		.equ H6+4
; start of variables that can't be moved in memory
;  (the round code shifts TEMP1..GBUF onto ABUF..HBUF with one
;  lddr, and reaches TEMP1 as EBUF-20 and TEMP2 as ABUF+32)
TEMP1		.equ H7+4	; T1 of the current round
ABUF		.equ TEMP1+4	; working variables a..h
BBUF		.equ ABUF+4
CBUF		.equ BBUF+4
DBUF		.equ CBUF+4
EBUF		.equ DBUF+4
FBUF		.equ EBUF+4
GBUF		.equ FBUF+4
HBUF		.equ GBUF+4
TEMP2		.equ HBUF+4	; T2 of the current round
; end of variables that can't be moved in memory
WBUF		.equ TEMP2+4	; message block, then W(0..63)
sigma_temp	.equ WBUF+(4*64)	; 12 bytes of xor scratch
dataptr		.equ sigma_temp+(3*4)	; next input byte (2 bytes)
step3_t		.equ dataptr+2	; round counter t (1 byte)
size		.equ step3_t+1	; input length in bytes (2 bytes)

.org $9327			; magic number

; program entry: Str1 holds the name of the program to hash.
;  Look the program up, split its size into whole 64-byte
;  blocks plus a remainder, and load the initial hash state.
	call clearvars		; clear variables

; based on code from squish by Pat Milheron
	call _zeroop1
	ld hl,Str1
	ld de,OP1
	ld bc,3
	ldir
	call _findsym		; lookup Str1
	jp c,_errundefined

	and $1F
	cp strngobj		; is it a string?
	jp nz,_errsyntax

; (de) is the string's 2-byte length.  Check it before copying:
;  a length of 0 would make ldir copy 64K, and more than 8
;  bytes (the longest program name) would run past OP1.
	ld a,(de)		; length LSB
	ld c,a
	inc de
	ld a,(de)		; length MSB
	inc de			; (de) is name ptr
	or a
	jp nz,_errundefined	; 256 bytes or more
	ld b,a			; b = 0, so bc = length
	or c			; a = length
	jp z,_errundefined	; empty name
	cp 8+1
	jp nc,_errundefined	; too long to be a program

	ld hl,OP1
	ld (hl),progobj
	inc hl
	ex de,hl		; hl = name, de = OP1+1
	ldir			; name to op1

	call _chkfindsym	; size ptr -> de
	jp c,_errundefined
; end of squish-based code

	ex de,hl
	ld e,(hl)		; LSB -> e
	inc hl
	ld d,(hl)		; MSB -> d
	inc hl
	ld (size),de
	ld (dataptr),hl		; first byte of program data

; de = size / 64; the 6 bits shifted out are collected at the
;  top of a, then moved down to give a = size mod 64
	ld b,6
	xor a
div64:	srl d
	rr e
	rr a			; save remainder
	djnz div64
	srl a
	srl a

	ld (block),de		; # of _whole_ blocks
	ld (extra_bytes),a

; the initial hash is 8 words of 4 bytes.  Copy exactly that
;  much: anything more would read on into Karray.
	ld hl,H0init
	ld de,H0
	ld bc,8*4
	ldir

; hash every whole 64-byte block of the input:
;  while (block != 0) { block--; copy 64 bytes to WBUF; update }
do_hash:			; a label or advice?
	ld hl,(block)
	ld a,h
	or l			; zero only if h and l both are
	jr z,do_padding		; no whole blocks left

	dec hl
	ld (block),hl		; block--
	ld bc,64
	ld de,WBUF
	ld hl,(dataptr)
	ldir			; copy block
	ld (dataptr),hl		; dataptr += 64

	call sha256update
	jr do_hash

; build the final block(s): the leftover input bytes, a $80
;  marker byte, zeros, and the message length in bits in the
;  last bytes of the block.  If the marker and the 8 length
;  bytes don't fit after the leftover bytes, the block is
;  hashed as it is and the length goes into one more block.
;
; a holds extra_bytes from here down to load_0s: neither
;  ldir nor ex de,hl changes it
do_padding:
	ld a,(extra_bytes)
	ld b,0
	ld c,a
	ld hl,(dataptr)
	ld de,WBUF
	cp 0
	jr z,no_cpy		; bc = 0 would make ldir copy 64K
	ldir
no_cpy:	ex de,hl		; hl = first byte after the data
	ld (hl),$80		; the mandatory 1 bit
	inc hl
	cp 63			; did $80 fill the block?
	jr z,need_another_block	;  (also avoids a djnz count of 0)

; entered with a = bytes of WBUF in use before the marker,
;  hl = first byte to clear
load_0s:
	neg
	add a,63		; 64-1-extra_bytes
	ld b,a
zero_fill:
	ld (hl),0
	inc hl
	djnz zero_fill

	cp 8			; room for length?
	jr c,need_another_block
; since the message length can't possibly be more than
;  16 bits, I'm safe reusing the 32-bit circular shift
;  from the hash code to multiply by 8
	ld de,(size)
	ld hl,$0000		; ensure 0s shift in
	ld b,3
	call slcb		; hlde = size * 8
	ld a,e
	ld (WBUF+63),a		; low three bytes of the bit
	ld a,d			;  count; the bytes above them
	ld (WBUF+62),a		;  were zeroed by zero_fill
	ld a,l
	ld (WBUF+61),a

	call sha256update
	jr done

; hash the block holding the marker, then go round again with
;  a = 0 so that load_0s clears WBUF+0..62 and the length
;  store above writes the last bytes (WBUF+63 is not cleared,
;  but it is overwritten by the length)
need_another_block:
	call sha256update
	xor a
	ld hl,WBUF
	jr load_0s		; do another block


; jumped to once the last block is hashed: delete Str1, create
;  it again as a 64-character string, and fill that with the
;  hash H0..H7 as hex digits, two per byte
done:	call _zeroop1
	ld bc,3
	ld de,OP1
	ld hl,Str1
	ldir
	call _chkfindsym
	call _delvar
	ld hl,2*256/8
	call _createstrng	; recreate Str1
	inc de			; store_hex steps de before
				;  each store
	ld hl,H0
	ld b,256/8		; bytes of hash
store:	ld a,(hl)
	inc hl
	ld c,a			; keep the byte for later
	rrca
	rrca
	rrca
	rrca			; nibbles swapped
	and $0F
	call store_hex		; high nibble

	ld a,c
	and $0F
	call store_hex		; low nibble

	djnz store

	jp clearvars		; wipe the variables; its ret
				;  ends the program
; end of done subroutine


.include "sha256up.z80"		; the fun stuff


; clearvars -- zero every variable from block through size
;  clobbers a, bc, de and hl
;
; the first byte is cleared by hand; ldir then copies each
;  byte onto the next one, spreading the zero upwards
clearvars:
	ld hl,block
	xor a
	ld (hl),a
	ld de,block+1
	ld bc,size+1-block	; size is 2 bytes wide, so the
				;  last byte to clear is size+1
	ldir
	ret
; end of clearvars subroutine


; store_hex -- append one hex digit to the output string
;  in:  a  = value to write, 0..15
;       de = address of the previous character
;  out: de = de+1, (de) = '0'..'9' or 'A'..'F'
;  a and flags are clobbered
store_hex:
	inc de
	cp $A
	jr c,store_hex_digit
	add a,'A'-'0'-$A	; letters: add the extra distance
				;  from '0'+10 up to 'A'
store_hex_digit:
	add a,'0'
	ld (de),a
	ret
; end of hex character store subroutine

.end