4 | 4 |
use vars qw($VERSION @ISA @EXPORT);
|
5 | 5 |
use 5.006_002;
|
6 | 6 |
|
7 | |
$VERSION = '0.33';
|
|
7 |
$VERSION = '0.40';
|
8 | 8 |
@ISA = qw(Exporter);
|
9 | 9 |
@EXPORT = qw(zap_cp1252 fix_cp1252);
|
10 | 10 |
use constant PERL588 => $] >= 5.008_008;
|
11 | |
require Encode if PERL588;
|
|
11 |
use Encode ();
|
12 | 12 |
|
13 | 13 |
our %ascii_for = (
|
14 | |
# http://en.wikipedia.org/wiki/Windows-1252
|
|
14 |
# https://en.wikipedia.org/wiki/Windows-1252
|
15 | 15 |
"\x80" => 'e', # EURO SIGN
|
16 | 16 |
"\x82" => ',', # SINGLE LOW-9 QUOTATION MARK
|
17 | 17 |
"\x83" => 'f', # LATIN SMALL LETTER F WITH HOOK
|
|
42 | 42 |
);
|
43 | 43 |
|
44 | 44 |
our %utf8_for = (
|
45 | |
# http://en.wikipedia.org/wiki/Windows-1252
|
|
45 |
# https://en.wikipedia.org/wiki/Windows-1252
|
46 | 46 |
"\x80" => '€', # EURO SIGN
|
47 | 47 |
"\x82" => ',', # SINGLE LOW-9 QUOTATION MARK
|
48 | 48 |
"\x83" => 'ƒ', # LATIN SMALL LETTER F WITH HOOK
|
|
72 | 72 |
"\x9f" => 'Ÿ', # LATIN CAPITAL LETTER Y WITH DIAERESIS
|
73 | 73 |
);
|
74 | 74 |
|
|
75 |
my @utf8_skip = (
|
|
76 |
# This translates a utf-8-encoded byte into how many bytes the full utf8
|
|
77 |
# character occupies. Illegal start bytes have a negative count.
|
|
78 |
|
|
79 |
# UTF-8 is a variable-length encoding. The 128 ASCII characters were very
|
|
80 |
# deliberately set to be themselves, so UTF-8 would be backwards compatible
|
|
81 |
# with 7-bit applications. Every other character has 2 - 13 bytes comprising
|
|
82 |
# it.
|
|
83 |
#
|
|
84 |
# If the first bit of the first byte in a character is 0, it is one of those
|
|
85 |
# 128 ASCII characters with length 1.
|
|
86 |
|
|
87 |
# Otherwise, the first bit is 1, and if the second bit is also one, this byte
|
|
88 |
# starts the sequence of bytes that represent the character. The bytes C0-FF
|
|
89 |
# have the characteristic that the first two bits are both one. The number of
|
|
90 |
# bytes that form a character corresponds to the number of consecutive leading
|
|
91 |
# bits that are all one in the start byte. In the case of FE, the first 7
|
|
92 |
# bits are one, so the number of bytes in the character it represents is 7.
|
|
93 |
# FF is a special case, and Perl has arbitrarily set it to 13 instead of the
|
|
94 |
# expected 8.
|
|
95 |
#
|
|
96 |
# The remaining bytes begin with '10', from 80..9F. They are called
|
|
97 |
# continuation bytes, and a UTF-8 character is comprised of a start byte
|
|
98 |
# indicating 'n' bytes total in it, then 'n-1' of these continuation bytes.
|
|
99 |
# What the character is that each sequence represents is derived by shifting
|
|
100 |
# and adding the other bits in the bytes. (C0 and C1 aren't actually legal
|
|
101 |
# start bytes for security reasons that need not concern us here, hence are
|
|
102 |
# marked as negative in the table below.)
|
|
103 |
|
|
104 |
# 0 1 2 3 4 5 6 7 8 9 A B C D E F
|
|
105 |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0
|
|
106 |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 1
|
|
107 |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 2
|
|
108 |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 3
|
|
109 |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 4
|
|
110 |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 5
|
|
111 |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 6
|
|
112 |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 7
|
|
113 |
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 8
|
|
114 |
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 9
|
|
115 |
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # A
|
|
116 |
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # B
|
|
117 |
-1,-1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C
|
|
118 |
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D
|
|
119 |
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E
|
|
120 |
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7,13, # F
|
|
121 |
);
|
|
122 |
|
75 | 123 |
BEGIN {
|
76 | 124 |
my $proto = $] >= 5.010000 ? '_' : '$';
|
77 | 125 |
eval "sub zap_cp1252($proto) { unshift \@_, \\%ascii_for; &_tweakit; }";
|
78 | 126 |
eval "sub fix_cp1252($proto) { unshift \@_, \\%utf8_for; &_tweakit; }";
|
79 | 127 |
}
|
80 | 128 |
|
|
129 |
# These are the bytes that CP1252 redefines
|
|
130 |
my $cp1252_re = qr/[\x80\x82-\x8c\x8e\x91-\x9c\x9e\x9f]/;
|
|
131 |
|
81 | 132 |
sub _tweakit {
|
82 | 133 |
my $table = shift;
|
83 | 134 |
return unless defined $_[0];
|
84 | 135 |
local $_[0] = $_[0] if defined wantarray;
|
85 | |
if (PERL588 && Encode::is_utf8($_[0])) {
|
86 | |
_tweak_decoded($table, $_[0]);
|
87 | |
} else {
|
88 | |
$_[0] =~ s{([\x80-\x9f])}{$table->{$1} || $1}emxsg;
|
|
136 |
my $is_utf8 = PERL588 && Encode::is_utf8($_[0]);
|
|
137 |
my $valid_utf8 = $is_utf8 && utf8::valid($_[0]);
|
|
138 |
if (!$is_utf8) {
|
|
139 |
|
|
140 |
# Here is non-UTF-8. Change the 1252 characters to their UTF-8
|
|
141 |
# counterparts. These bytes are very rarely used in real world
|
|
142 |
# applications, so their presence likely indicates that CP1252 was
|
|
143 |
# meant.
|
|
144 |
$_[0] =~ s/($cp1252_re)/$table->{$1}/gems;
|
|
145 |
} elsif ($valid_utf8) {
|
|
146 |
|
|
147 |
# Here is well-formed Perl extended UTF-8 and has the UTF-8 flag on
|
|
148 |
# and the string is held as bytes. Change the 1252 characters to their
|
|
149 |
# Unicode counterparts.
|
|
150 |
$_[0] =~ s/($cp1252_re)/Encode::decode_utf8($table->{$1})/gems;
|
|
151 |
} else { # Invalid UTF-8. Look for single-byte CP1252 gremlins
|
|
152 |
|
|
153 |
# Turn off the UTF-8 flag so that we can go through the string
|
|
154 |
# byte-by-byte.
|
|
155 |
Encode::_utf8_off($_[0]);
|
|
156 |
|
|
157 |
my $i = 0;
|
|
158 |
my $length = length $_[0];
|
|
159 |
my $fixed = ""; # The input after being fixed up by this loop
|
|
160 |
while ($i < $length) {
|
|
161 |
|
|
162 |
# Each time through the loop, we should here be ready to look at a
|
|
163 |
# new character, and it's 0th byte is called a 'start byte'
|
|
164 |
my $start_byte = substr($_[0], $i, 1);
|
|
165 |
my $skip = $utf8_skip[ord $start_byte];
|
|
166 |
|
|
167 |
# The table is set up so that legal UTF-8 start bytes have a
|
|
168 |
# positive byte length. Simply add all the bytes in the character
|
|
169 |
# to the output, and go on to handle the next character in the
|
|
170 |
# next loop iteration.
|
|
171 |
if ($skip > 0) {
|
|
172 |
$fixed .= substr($_[0], $i, $skip);
|
|
173 |
$i += $skip;
|
|
174 |
next;
|
|
175 |
}
|
|
176 |
|
|
177 |
# Here we have a byte that isn't a start byte in a position that
|
|
178 |
# should oughta be a start byte. The whole point of this loop is
|
|
179 |
# to find such bytes that are CP1252 ones and which were
|
|
180 |
# incorrectly inserted by the upstream process into an otherwise
|
|
181 |
# valid UTF-8 string. So, if we have such a one, change it into
|
|
182 |
# its corresponding correct character.
|
|
183 |
if ($start_byte =~ s/($cp1252_re)/$table->{$1}/ems) {
|
|
184 |
|
|
185 |
# The correct character may be UTF-8 bytes. We treat them as
|
|
186 |
# just a sequence of non-UTF-8 bytes, because that's what
|
|
187 |
# $fixed has in it so far. After everything is consistently
|
|
188 |
# added, we turn the UTF-8 flag back on before returning at
|
|
189 |
# the end.
|
|
190 |
Encode::_utf8_off($start_byte);
|
|
191 |
$fixed .= $start_byte;
|
|
192 |
$i++;
|
|
193 |
next;
|
|
194 |
}
|
|
195 |
|
|
196 |
# Here the byte isn't a CP1252 one.
|
|
197 |
die "Unexpected continuation byte: %02x", ord $start_byte;
|
|
198 |
}
|
|
199 |
|
|
200 |
# $fixed now has everything properly in it, but set to return it in
|
|
201 |
# $_[0], marked as UTF-8.
|
|
202 |
$_[0] = $fixed;
|
|
203 |
Encode::_utf8_on($_[0]);
|
89 | 204 |
}
|
90 | 205 |
return $_[0] if defined wantarray;
|
91 | |
}
|
92 | |
|
93 | |
sub _tweak_decoded {
|
94 | |
my $table = shift;
|
95 | |
local $@;
|
96 | |
# First, try to replace in the decoded string.
|
97 | |
eval {
|
98 | |
$_[0] =~ s{([\x80-\x9f])}{
|
99 | |
$table->{$1} ? Encode::decode('UTF-8', $table->{$1}) : $1
|
100 | |
}emxsg
|
101 | |
};
|
102 | |
if (my $err = $@) {
|
103 | |
# If we got a "Malformed UTF-8 character" error, then someone
|
104 | |
# likely turned on the utf8 flag without decoding. So turn it off.
|
105 | |
# and try again.
|
106 | |
die if $err !~ /Malformed/;
|
107 | |
Encode::_utf8_off($_[0]);
|
108 | |
$_[0] =~ s/([\x80-\x9f])/$table->{$1} || $1/emxsg;
|
109 | |
Encode::_utf8_on($_[0]);
|
110 | |
}
|
111 | 206 |
}
|
112 | 207 |
|
113 | 208 |
1;
|
|
141 | 236 |
encoding is Latin-1, mostly things will come out right, but a few things--like
|
142 | 237 |
curly quotes, m-dashes, ellipses, and the like--may not. The differences are
|
143 | 238 |
well-known; you see a nice chart at documenting the differences on
|
144 | |
L<Wikipedia|http://en.wikipedia.org/wiki/Windows-1252>.
|
|
239 |
L<Wikipedia|https://en.wikipedia.org/wiki/Windows-1252>.
|
145 | 240 |
|
146 | 241 |
Of course, that won't really help you. What will help you is to quit using
|
147 | 242 |
Latin-1 and switch to UTF-8. Then you can just convert from CP1252 to UTF-8
|
|
160 | 255 |
gremlins mixed in with properly encoded characters. I've seen examples of just
|
161 | 256 |
this sort of thing when processing GMail messages and attempting to insert
|
162 | 257 |
them into a UTF-8 database, as well as in some feeds processed by, say
|
163 | |
L<Yahoo! Pipes|http://pipes.yahoo.com>. Doesn't work so well. For such cases,
|
164 | |
there's C<fix_cp1252>, which converts those CP1252 gremlins into their UTF-8
|
165 | |
equivalents.
|
|
258 |
Yahoo! Pipes. Doesn't work so well. For such cases, there's C<fix_cp1252>,
|
|
259 |
which converts those CP1252 gremlins into their UTF-8 equivalents.
|
166 | 260 |
|
167 | 261 |
=head1 Usage
|
168 | 262 |
|
|
186 | 280 |
|
187 | 281 |
In this case, even constant values can be processed. Either way, C<undef>s
|
188 | 282 |
will be ignored.
|
|
283 |
|
|
284 |
In Perl 5.10 and higher, the functions may optionally be called with no
|
|
285 |
arguments, in which case C<$_> will be converted, instead:
|
|
286 |
|
|
287 |
zap_cp1252; # Modify $_ in-place.
|
|
288 |
fix_cp1252; # Modify $_ in-place.
|
|
289 |
my $zapped = zap_cp1252; # Copy $_ and return zapped
|
|
290 |
my $fixed = zap_cp1252; # Copy $_ and return fixed
|
189 | 291 |
|
190 | 292 |
In Perl 5.8.8 and higher, the conversion will work even when the string is
|
191 | 293 |
decoded to Perl's internal form (usually via C<decode 'ISO-8859-1', $text>) or
|
|
196 | 298 |
removing those CP1252 gremlins no matter what kind of processing has already
|
197 | 299 |
been executed on the string.
|
198 | 300 |
|
199 | |
In Perl 5.10 and higher, the functions may optionally be called with no
|
200 | |
arguments, in which case C<$_> will be converted, instead:
|
201 | |
|
202 | |
zap_cp1252; # Modify $_ in-place.
|
203 | |
fix_cp1252; # Modify $_ in-place.
|
204 | |
my $zapped = zap_cp1252; # Copy $_ and return zapped
|
205 | |
my $fixed = zap_cp1252; # Copy $_ and return fixed
|
|
301 |
That said, although C<fix_cp1252()> takes a conservative approach to replacing
|
|
302 |
text in Unicode strings, it should be used as a very last option. Really,
|
|
303 |
avoid that situation if you can.
|
206 | 304 |
|
207 | 305 |
=head1 Conversion Table
|
208 | 306 |
|
|
250 | 348 |
|
251 | 349 |
local $Encode::ZapCP1252::ascii_for{"\x80"} = 'E';
|
252 | 350 |
|
253 | |
Or if, for some bizarre reason, you wanted the UTF-8 equivalent for a bullet
|
254 | |
converted by C<fix_cp1252()> to really be an asterisk (why would you? Just use
|
255 | |
C<zap_cp1252> for that!), you can do this:
|
256 | |
|
257 | |
local $Encode::ZapCP1252::utf8_for{"\x95"} = '*';
|
|
351 |
Or if, for some reason, you wanted the UTF-8 equivalent for a bullet
|
|
352 |
converted by C<fix_cp1252()> to be a black square, you can assign the
|
|
353 |
bytes (never a Unicode string) like so:
|
|
354 |
|
|
355 |
local $Encode::ZapCP1252::utf8_for{"\x95"} = Encode::encode_utf8('■');
|
258 | 356 |
|
259 | 357 |
Just remember, without C<local> this would be a global change. In that case,
|
260 | 358 |
be careful if your code zaps CP1252 elsewhere. Of course, it shouldn't really
|
|
268 | 366 |
|
269 | 367 |
=item L<Encode>
|
270 | 368 |
|
271 | |
=item L<Wikipedia: Windows-1252|http://en.wikipedia.org/wiki/Windows-1252>
|
|
369 |
=item L<Encoding::FixLatin>
|
|
370 |
|
|
371 |
=item L<Wikipedia: Windows-1252|https://en.wikipedia.org/wiki/Windows-1252>
|
272 | 372 |
|
273 | 373 |
=back
|
274 | 374 |
|
275 | 375 |
=head1 Support
|
276 | 376 |
|
277 | 377 |
This module is stored in an open L<GitHub
|
278 | |
repository|http://github.com/theory/encode-cp1252/tree/>. Feel free to fork
|
|
378 |
repository|https://github.com/theory/encode-zapcp1252/>. Feel free to fork
|
279 | 379 |
and contribute!
|
280 | 380 |
|
281 | 381 |
Please file bug reports via L<GitHub
|
282 | |
Issues|http://github.com/theory/encode-cp1252/issues/> or by sending mail to
|
|
382 |
Issues|https://github.com/theory/encode-zapcp1252/issues/> or by sending mail to
|
283 | 383 |
L<bug-Encode-CP1252@rt.cpan.org|mailto:bug-Encode-CP1252@rt.cpan.org>.
|
284 | 384 |
|
285 | 385 |
=head1 Author
|
|
289 | 389 |
=head1 Acknowledgments
|
290 | 390 |
|
291 | 391 |
My thanks to Sean Burke for sending me his original method for converting
|
292 | |
CP1252 gremlins to more-or-less appropriate ASCII characters.
|
|
392 |
CP1252 gremlins to more-or-less appropriate ASCII characters, and to Karl
|
|
393 |
Williamson for more correct handling of Unicode strings.
|
293 | 394 |
|
294 | 395 |
=head1 Copyright and License
|
295 | 396 |
|
296 | |
Copyright (c) 2005-2010 David E. Wheeler. Some Rights Reserved.
|
|
397 |
Copyright (c) 2005-2020 David E. Wheeler. Some Rights Reserved.
|
297 | 398 |
|
298 | 399 |
This module is free software; you can redistribute it and/or modify it under the
|
299 | 400 |
same terms as Perl itself.
|