summaryrefslogtreecommitdiffstats
path: root/utf.S
blob: d50e7799f8805bbffa700348930279b389701fee (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#include "v9fs.h"

	.code16
	.section ".rtext","ax"

/* ------------------------------------------------------------------------- *
 *  cptoutf
 *
 *  Translate one codepage byte into UTF-8.
 *
 *  In:   AL    = codepage byte
 *        ES:DI = where the UTF-8 bytes are written
 *  Out:  DI advanced past the 1, 2 or 3 bytes that were stored
 *        AX destroyed, BX preserved
 *
 *  The code point always comes from cptoutftbl, even for plain ASCII,
 *  because the same table also performs case conversion.
 * ------------------------------------------------------------------------- */

	.globl cptoutf
	.type cptoutf,@function
cptoutf:
	pushw %bx
	movzbw %al,%bx
	shlw $1,%bx			/* table entries are 16 bits wide */
	movw cptoutftbl(%bx),%ax	/* AX = code point */
	cmpw $0x7f,%ax
	jbe .Lcptoutf_store		/* U+0000..U+007F: the byte as-is */

	movw %ax,%bx			/* BX keeps the full code point */
	cmpw $0x7ff,%ax
	jbe .Lcptoutf_lead2		/* U+0080..U+07FF: two bytes */

	/* U+0800..U+FFFF: lead byte 1110xxxx carries bits 15..12 */
	shrw $12,%ax
	orb $0xe0,%al
	stosb

	/* Middle byte 10xxxxxx carries bits 11..6 */
	movw %bx,%ax
	shrw $6,%ax
	andb $0x3f,%al
	orb $0x80,%al
	stosb
	jmp .Lcptoutf_tail

.Lcptoutf_lead2:
	/* Lead byte 110xxxxx carries bits 10..6 */
	shrw $6,%ax
	orb $0xc0,%al
	stosb

.Lcptoutf_tail:
	/* Final byte 10xxxxxx carries bits 5..0 */
	movb %bl,%al
	andb $0x3f,%al
	orb $0x80,%al

.Lcptoutf_store:
	stosb
	popw %bx
	retw

	.size cptoutf,.-cptoutf

/* ------------------------------------------------------------------------- *
 *  utftocp
 *
 *  Convert a UTF-8 sequence in FS:SI to codepage form in AL.
 *
 *  In:   FS:SI -> first byte of the UTF-8 sequence
 *  Out:  CF=0: AX = codepage byte (AH = 0), SI points past the sequence
 *        CF=1: error, unknown or noncanonical character; AX and SI are
 *              undefined
 *  CX, DX and DI are preserved.
 *
 *  The table search uses SCASW, which always reads through ES:DI, so ES
 *  has to address the segment that holds cptoutftbl.  The string
 *  instructions also assume DF=0.  (Neither is established here; confirm
 *  with the callers.)
 *
 *  For example, U+0041 ('A') is noncanonical, because a codepage 'A'
 *  gets converted to lowercase and transmitted as U+0061 ('a').
 * ------------------------------------------------------------------------- */

utftocp:
	pushw %cx
	pushw %dx
	pushw %di		/* DI is needed for the table scan */

	xorw %ax,%ax
	xorw %cx,%cx		/* CX = smallest code point valid for this length */
	xorw %dx,%dx		/* DX = code point bits collected so far */

	fs lodsb
	andb %al,%al
	jns 1f			/* 1-byte sequence */

	cmpb $0xc2,%al
	jb 99f			/* Continuation byte or overlong 2-byte lead */

	xorb $0xc0,%al
	cmpb $0x20,%al
	jb 2f			/* 2-byte sequence */

	movb $0x08,%ch		/* U+0800 minimum */
	xorb $0x20,%al
	cmpb $0x10,%al
	jb 3f			/* 3-byte sequence */

	/* Longer sequences cannot match a 16-bit table entry: fall into error */
99:
	stc
	popw %di
	popw %dx
	popw %cx
	retw
3:
	movw %ax,%dx		/* Lead byte payload (4 bits) */
	shlw $6,%dx
	fs lodsb
	xorb $0x80,%al
	cmpb $0x40,%al
	jae 99b			/* Not a 10xxxxxx continuation byte */

2:
	addw %ax,%dx		/* Merge in the 5 (2-byte) or 6 (3-byte) payload bits */
	shlw $6,%dx
	fs lodsb
	xorb $0x80,%al
	cmpb $0x40,%al
	jae 99b			/* Not a 10xxxxxx continuation byte */
	addw %dx,%ax
1:
	cmpw %cx,%ax
	jb 99b			/* Overlong sequence */

	/* Now %ax has the Unicode code point; look for a canonical entry */
	movw $cptoutftbl,%di
	movw $256,%cx		/* Entries left to scan */
91:
	repne scasw
	jne 99b			/* Table exhausted without a match */

	/*
	 * Match at index 255 - CX.  CX <= 255 here, so CH = 0 and
	 * complementing the low byte gives the index.  The index is kept
	 * in DX so CX remains usable as the scan count.
	 */
	movw %cx,%dx
	notb %dl
	btw %dx,canonmap
	jc 98f			/* 1 in map = canonical = it's good! */

	/* Several codepage bytes can share a code point; try the rest */
	testw %cx,%cx
	jnz 91b
	jmp 99b
98:
	movw %dx,%ax		/* AL = codepage byte, AH = 0 */
	clc			/* BT left CF=1; success is reported as CF=0 */
	popw %di
	popw %dx
	popw %cx
	retw

	.size utftocp,.-utftocp
	.type utftocp,@function
	
	/*
	 * Working conversion table, followed by the canonical-character map:
	 *
	 *   cptoutftbl  256 entries of 16 bits (512 bytes), indexed by
	 *               codepage byte, holding the code point to transmit
	 *   canonmap    256 bits (32 bytes), indexed by codepage byte with
	 *               BT/BTS/BTR; 1 = the byte is canonical
	 *
	 * lrbuf is a macro defined outside this file; the arguments look like
	 * name, size in bytes and alignment - confirm against its definition.
	 */
	lrbuf cptoutftbl, 512+32, 2
canonmap	= cptoutftbl + 512
	
/* ------------------------------------------------------------------------- *
 *  Unicode table
 *
 * We compile in a single table (typically 437) - this can be overwritten
 * by data loaded in from a file at install time if necessary
 * ------------------------------------------------------------------------- */

	
	/*
	 * Compiled-in codepage description, included byte for byte from
	 * cp/default.uni.  The symbols defined after it are fixed offsets
	 * into this image, so they must agree with the layout of that file.
	 */
	.section ".data","aw"
	.globl unifile
	.balign 2			/* Keeps the 16-bit tables at even addresses */
unifile:
	.incbin "cp/default.uni"
	.size unifile,.-unifile
	.type unifile,@object

/*
 * The unifile has a 32-byte header, followed by the tables below.
 *
 * unicasetable has to start exactly 256 entries after unitable:
 * utf_init_case reaches it by adding 256 to an index into unitable.
 */
uctable		= unifile + 32		/* 256 bytes: codepage upper-case table */
lctable		= uctable + 256		/* 256 bytes: codepage lower-case table */
unitable	= lctable + 256		/* 256 words: codepage to Unicode table */
unicasetable	= unitable + 2*256	/* 256 words: codepage case-swap table */
lcbitmap	= unicasetable + 2*256	/* Bitmap by codepage byte: 1 = case-swap is to lower case */

/* ------------------------------------------------------------------------- *
 *  Routines to fill in cptoutftbl
 *
 *  The first time we simply copy unitable - this is used for the tag.
 *  After the tag is parsed, we create the case-converting table.
 *
 *  XXX: These routines should query the system code page and load the
 *  appropriate conversion table if not the compiled-in one.
 * ------------------------------------------------------------------------- */

		.section ".text","ax"

		.globl utf_init_plain
		.type utf_init_plain,@function

		/*
		 * Load cptoutftbl with an unmodified copy of unitable, i.e.
		 * a conversion table without any case folding.
		 *
		 * Copies DS:SI -> ES:DI, so ES must address the segment of
		 * cptoutftbl, and DF must be clear (neither is set up here;
		 * confirm with the callers).  Clobbers CX, SI and DI.
		 */
utf_init_plain:
		movw $(512/2),%cx	/* 256 entries, 16 bits each */
		movw $cptoutftbl,%di
		movw $unitable,%si
		rep movsw
		retw

		.size utf_init_plain,.-utf_init_plain

		/*
		 * utf_init_case
		 *
		 * Rebuild cptoutftbl as the case-converting table and fill
		 * in canonmap.
		 *
		 * When dos_version is at least 0x031e, DOS is asked for its
		 * filename upper-case table; if the reply looks sane, it
		 * replaces the upper half (bytes 0x80..0xff) of uctable.
		 * Then, for every codepage byte c:
		 *
		 *   canonmap[c]   = (uctable[c] == c)
		 *   cptoutftbl[c] = Unicode of uctable[c], taken from the
		 *                   case-swap table when lcbitmap says the
		 *                   swapped form is the lower-case one
		 *
		 * All general registers and FS are preserved; flags are not.
		 * Assumes ES addresses the same segment as DS and that DF=0
		 * (not established here; confirm with the callers).
		 */
		.globl utf_init_case
		.type utf_init_case,@function

		/*
		 * Reply buffer for INT 21h AX=6504h: one ID byte followed by
		 * a far pointer.  libuf is a macro defined outside this file;
		 * presumably name, size and alignment - confirm.
		 */
		libuf uctbl, 5, 1
	
utf_init_case:
		pushaw
		pushw %fs		/* lfsw below overwrites FS; pushaw doesn't save it */
	
		cmpw $0x31e,dos_version	/* Use case table from DOS? */
		jb 1f
	
		movw $-1,%dx		/* Current country */
		movw %dx,%bx		/* Current code page (XXX) */
		movw $5,%cx		/* Size of buffer */
		movw $uctbl,%di
		movw $0x6504,%ax	/* Get filename upper case table */
		int $0x21
		jc 1f
		cmpb $4,uctbl		/* Table ID = 4 */
		jne 1f
	
		lfsw uctbl+1,%si	/* FS:SI -> table supplied by DOS */
		fs lodsw
		cmpw $128,%ax		/* First word is table length = 128 */
		jne 1f			/* If not, assume table is bad */
		movw $uctable+128,%di	/* The DOS table covers bytes 0x80..0xff */
		movw $(128/4),%cx
		fs rep movsl
1:
		xorw %bx,%bx		/* BX = codepage byte being processed */
		movw $cptoutftbl,%di
2:
		movzbw uctable(%bx),%si	/* SI = upper-case form of BX */
		btrw %bx,canonmap	/* Start from "not canonical" */
		cmpw %bx,%si		/* Is this canonical? */
		jne 3f
		btsw %bx,canonmap	/* 1 in map = canonical */
3:
		btw %si,lcbitmap	/* Is alternate case lower case? */
		jnc 4f
		addw $256,%si		/* Use the alternate case table */
4:
		addw %si,%si		/* Entry index -> byte offset */
		movw unitable(%si),%ax
		stosw
		incb %bl		/* BH stays 0; wraps to 0 after byte 0xff */
		jnz 2b

		popw %fs
		popaw
		retw

		.size utf_init_case,.-utf_init_case