diff options
author | H. Peter Anvin <hpa@zytor.com> | 2013-02-25 19:10:15 -0800 |
---|---|---|
committer | H. Peter Anvin <hpa@zytor.com> | 2013-02-25 19:10:15 -0800 |
commit | 952ffec34648dcd77b15744b3d8c660112022c0d (patch) | |
tree | 3d75174e53d686668912965788a83b25fa2b6b67 /cp | |
parent | 1dd96271ad8fd8fb7661f27cbff8d425a8b9d698 (diff) | |
download | virtio9p-952ffec34648dcd77b15744b3d8c660112022c0d.tar.gz virtio9p-952ffec34648dcd77b15744b3d8c660112022c0d.tar.xz virtio9p-952ffec34648dcd77b15744b3d8c660112022c0d.zip |
checkpoint: cp-to-utf conversion now includes case conversion
Diffstat (limited to 'cp')
-rw-r--r-- | cp/Makefile | 4 | ||||
-rwxr-xr-x | cp/cptable.pl | 160 |
2 files changed, 140 insertions, 24 deletions
diff --git a/cp/Makefile b/cp/Makefile index 90fa950..276f2f4 100644 --- a/cp/Makefile +++ b/cp/Makefile @@ -10,8 +10,8 @@ all: $(GENFILES) # This generates codepage files where the display and filesystem # codepages are both the same. -%.uni: %.txt cptable.pl - $(PERL) cptable.pl $< $@ +%.uni: %.txt cptable.pl UnicodeData + $(PERL) cptable.pl UnicodeData $< $< $@ default.uni: $(DEFAULT).uni cp -f $< $@ diff --git a/cp/cptable.pl b/cp/cptable.pl index 19e72a0..89c4040 100755 --- a/cp/cptable.pl +++ b/cp/cptable.pl @@ -11,62 +11,178 @@ # http://www.unicode.org/Public/UNIDATA/UCD.html # -($cpfs, $cpout) = @ARGV; +($ucd, $cpco, $cpfs, $cpout) = @ARGV; if (!defined($cpout)) { - die "Usage: $0 UnicodeData cp.txt output.cp\n"; + die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n"; } +%ucase = (); +%lcase = (); +%tcase = (); +%decomp = (); + +open(UCD, '<', $ucd) + or die "$0: could not open unicode data: $ucd: $!\n"; +while (defined($line = <UCD>)) { + chomp $line; + if ($line =~ /Name\:.*\bcp([0-9]+)\b/i) { + $cpnum = $1 + 0; + } + @f = split(/;/, $line); + $n = hex $f[0]; + $ucase{$n} = ($f[12] ne '') ? hex $f[12] : $n; + $lcase{$n} = ($f[13] ne '') ? hex $f[13] : $n; + $tcase{$n} = ($f[14] ne '') ? hex $f[14] : $n; + if ($f[5] =~ /^[0-9A-F\s]+$/) { + # This character has a canonical decomposition. + # The regular expression rejects angle brackets, so other + # decompositions aren't permitted. + $decomp{$n} = []; + foreach my $dch (split(' ', $f[5])) { + push(@{$decomp{$n}}, hex $dch); + } + } +} +close(UCD); + # -# Read the code pages +# Filesystem and console codepages. The filesystem codepage is used +# for FAT shortnames, whereas the console codepage is whatever is used +# on the screen and keyboard. # @xtab = (undef) x 256; %tabx = (); open(CPFS, '<', $cpfs) or die "$0: could not open fs codepage: $cpfs: $!\n"; while (defined($line = <CPFS>)) { - if ($line =~ /\#.*Name\:.*\bcp([0-9]+)/i) { - $cpnum = $1 + 0; - next; - } $line =~ s/\s*(\#.*|)$//; @f = split(/\s+/, $line); next if (scalar @f != 2); next if (hex $f[0] > 255); $xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode - $tabx[hex $f[1]] = hex $f[0]; # Unicode -> Codepage + $tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage } close(CPFS); -# -# Check that ASCII is 1:1 -# -for ($i = 0; $i < 128; $i++) { - if ($xtab[$i] != $i) { - die "$0: code page file $cpfs does not identity-map ASCII\n"; - } +@ytab = (undef) x 256; +%taby = (); +open(CPCO, '<', $cpco) + or die "$0: could not open console codepage: $cpco: $!\n"; +while (defined($line = <CPCO>)) { + $line =~ s/\s*(\#.*|)$//; + @f = split(/\s+/, $line); + next if (scalar @f != 2); + next if (hex $f[0] > 255); + $ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode + $taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage } +close(CPCO); open(CPOUT, '>', $cpout) or die "$0: could not open output file: $cpout: $!\n"; +# +# Magic number, in anticipation of being able to load these +# files dynamically... +# +print CPOUT pack("VV", 0x58a8b3d4, 0x51d21eb1); +# Header fields (some available for future use) +print CPOUT pack("VVVVVV", $cpnum, 0, 0, 0, 0, 0); + +# +# Self (shortname) uppercase table. +# This depends both on the console codepage and the filesystem codepage; +# the logical transcoding operation is: +# +# $tabx{$ucase{$ytab[$i]}} # -# Write a simple header +# ... where @ytab is console codepage -> Unicode and +# %tabx is Unicode -> filesystem codepage. # -print CPOUT pack("Vvv", 0x8740f99c, $cpnum, 0); +@uctab = (undef) x 256; +for ($i = 0; $i < 256; $i++) { + $uuc = $ucase{$ytab[$i]}; # Unicode upper case + if (defined($tabx{$uuc})) { + # Straight-forward conversion + $u = $tabx{$uuc}; + } elsif (defined($tabx{${$decomp{$uuc}}[0]})) { + # Upper case equivalent stripped of accents + $u = $tabx{${$decomp{$uuc}}[0]}; + } else { + # No equivalent at all found. Assume it is a lower-case-only + # character, like greek alpha in CP437. + $u = $i; + } + $uctab[$i] = $u; + print CPOUT pack("C", $u); +} + +# +# Self (shortname) lowercase table. +# This depends both on the console codepage and the filesystem codepage; +# the logical transcoding operation is: +# +# $taby{$lcase{$xtab[$i]}} +# +# ... where @ytab is console codepage -> Unicode and +# %tabx is Unicode -> filesystem codepage. +# +@lctab = (undef) x 256; +for ($i = 0; $i < 256; $i++) { + $llc = $lcase{$xtab[$i]}; # Unicode lower case + if (defined($l = $taby{$llc}) && $uctab[$l] == $i) { + # Straight-forward conversion + } elsif (defined($l = $tabx{${$decomp{$llc}}[0]}) && $uctab[$l] == $i) { + # Lower case equivalent stripped of accents + } else { + # No equivalent at all found. Find *anything* that matches the + # bijection criterion... + for ($l = 0; $l < 256; $l++) { + last if ($uctab[$l] == $i); + } + $l = $i if ($l == 256); # If nothing, we're screwed anyway... + } + $lctab[$i] = $l; + print CPOUT pack("C", $l); +} # # Unicode (longname) matching table. # This only depends on the console codepage. # -for ($i = 128; $i < 256; $i++) { - $p0 = $xtab[$i]; - if (!defined($xtab[$i]) || $p0 > 0xffff) { - $p0 = 0xffff; +$pp0 = ''; $pp1 = ''; +@lcbitmap = (0) x 32; + +for ($i = 0; $i < 256; $i++) { + if (!defined($ytab[$i])) { + $p0 = $p1 = 0xffff; + } else { + $p0 = $ytab[$i]; + if ($ucase{$p0} != $p0) { + $p1 = $ucase{$p0}; + } elsif ($lcase{$p0} != $p0) { + $p1 = $lcase{$p0}; + $lcbitmap[$i >> 3] |= (1 << ($i & 7)) if ($p1 < 0xffff); + } elsif ($tcase{$p0} != $p0) { + $p1 = $tcase{$p0}; + } else { + $p1 = $p0; + } } + # Only the BMP is supported... + $p0 = 0xffff if ($p0 > 0xffff); + $p1 = 0xffff if ($p1 > 0xffff); $pp0 .= pack("v", $p0); + $pp1 .= pack("v", $p1); } -print CPOUT $pp0; +print CPOUT $pp0, $pp1; + +# +# Finally, bitmap indicating which altcharacters are lower case +# +print CPOUT pack("C*", @lcbitmap); + close (CPOUT); |