Codebase list libencode-zapcp1252-perl / cfe1814
New upstream version 0.40 Niko Tyni 3 years ago
11 changed file(s) with 258 addition(s) and 143 deletion(s). Raw diff Collapse all Expand all
1515 },
1616 meta_merge => {
1717 resources => {
18 homepage => 'http://search.cpan.org/dist/Encode-CP1252/',
19 bugtracker => 'http://github.com/theory/encode-cp1252/issues/',
20 repository => 'http://github.com/theory/encode-cp1252/tree',
18 homepage => 'https://search.cpan.org/dist/Encode-ZapCP1252/',
19 bugtracker => 'https://github.com/theory/encode-zapcp1252/issues/',
20 repository => 'https://github.com/theory/encode-zapcp1252',
2121 }
2222 },
2323 );
00 Revision history for Perl extension Encode::ZapCP1252
1
2 0.40 2020-02-04T15:30:50Z
3 - Added link to Encoding::FixLatin. Suggested by Jonas Smedegaard.
4 - Removed Pod tests from the distribution.
5 - Fixed an issue discovered on Perl 5.31 that incorrectly resulted in
6 the creation of malformed UTF-8 when fixing unicode strings. Thanks to
7 Karl Williamson for the continuing improvement of Unicode support in
8 Perl and for the fix to this module.
19
210 0.33 2011-11-23T05:19:36Z
311 - Require Test::Pod 1.41 to support `L<text|url>` syntax in the Pod.
1523 0.30 2010-06-12T18:05:38
1624 - The conversion functions now ignore `undef` arguments and just return
1725 without doing anything.
18 - Strings are no longer modifed in-place unless the conversion
26 - Strings are no longer modified in-place unless the conversion
1927 subroutines are called in a void context.
20 - The conversion functions may optionally be called with no arugment
28 - The conversion functions may optionally be called with no argument
2129 when run in Perl 5.10 or higher, in which case they will instead act
2230 on `$_`.
2331
2432 0.20 2010-06-12T00:39:35
25 - Added `local` to examples of changing the maping tables.
26 - When the Encode module is insatlled, zapping and fixing CP1252
33 - Added `local` to examples of changing the mapping tables.
34 - When the Encode module is installed, zapping and fixing CP1252
2735 gremlins now works in decoded strings, too.
2836 - For convenience, the functions now return the strings they've
2937 modified.
3038 - Shipping with a traditional `Makefile.PL` rather than one that
3139 passes through to Module::Build.
32 - Moved repository to [GitHub](http://github.com/theory/encode-zapcp1252).
40 - Moved repository to [GitHub](https://github.com/theory/encode-zapcp1252).
3341
3442 0.12 2008-06-23T17:48:04
3543 - Fixed pasto in the "Support" section of the docs.
36 - Fixed a typo in the "Synopsis" section of the docs, thaks to David
44 - Fixed a typo in the "Synopsis" section of the docs, thanks to David
3745 Beaudet.
3846 - Fixed the 5.6.2 requirement to be properly detected in Perl 5.5.
3947 Thanks to Slaven Rezic for the report.
00 Build.PL
11 Changes
22 lib/Encode/ZapCP1252.pm
3 Makefile.PL
34 MANIFEST This list of files
4 README
5 META.json
6 META.yml
7 README.md
58 t/base.t
69 t/decoded.t
710 t/perl-510.t
8 t/pod.t
9 Makefile.PL
10 META.yml
11 META.json
33 "David E. Wheeler <david@justatheory.com>"
44 ],
55 "dynamic_config" : 1,
6 "generated_by" : "Module::Build version 0.38, CPAN::Meta::Converter version 2.112150",
6 "generated_by" : "Module::Build version 0.4229",
77 "license" : [
88 "perl_5"
99 ],
1010 "meta-spec" : {
1111 "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
12 "version" : "2"
12 "version" : 2
1313 },
1414 "name" : "Encode-ZapCP1252",
1515 "prereqs" : {
2626 },
2727 "runtime" : {
2828 "recommends" : {
29 "Encode" : 0,
29 "Encode" : "0",
3030 "Test::Pod" : "1.41"
3131 },
3232 "requires" : {
3737 "provides" : {
3838 "Encode::ZapCP1252" : {
3939 "file" : "lib/Encode/ZapCP1252.pm",
40 "version" : "0.33"
40 "version" : "0.40"
4141 }
4242 },
4343 "release_status" : "stable",
4444 "resources" : {
4545 "bugtracker" : {
46 "web" : "http://github.com/theory/encode-cp1252/issues/"
46 "web" : "https://github.com/theory/encode-zapcp1252/issues/"
4747 },
48 "homepage" : "http://search.cpan.org/dist/Encode-CP1252/",
48 "homepage" : "https://search.cpan.org/dist/Encode-ZapCP1252/",
4949 "license" : [
5050 "http://dev.perl.org/licenses/"
5151 ],
5252 "repository" : {
53 "url" : "http://github.com/theory/encode-cp1252/tree"
53 "url" : "https://github.com/theory/encode-zapcp1252"
5454 }
5555 },
56 "version" : "0.33"
56 "version" : "0.40",
57 "x_serialization_backend" : "JSON::PP version 4.02"
5758 }
22 author:
33 - 'David E. Wheeler <david@justatheory.com>'
44 build_requires:
5 Module::Build: 0.36
6 Test::More: 0.17
5 Module::Build: '0.36'
6 Test::More: '0.17'
77 configure_requires:
8 Module::Build: 0.36
8 Module::Build: '0.36'
99 dynamic_config: 1
10 generated_by: 'Module::Build version 0.38, CPAN::Meta::Converter version 2.112150'
10 generated_by: 'Module::Build version 0.4229, CPAN::Meta::Converter version 2.150010'
1111 license: perl
1212 meta-spec:
1313 url: http://module-build.sourceforge.net/META-spec-v1.4.html
14 version: 1.4
14 version: '1.4'
1515 name: Encode-ZapCP1252
1616 provides:
1717 Encode::ZapCP1252:
1818 file: lib/Encode/ZapCP1252.pm
19 version: 0.33
19 version: '0.40'
2020 recommends:
21 Encode: 0
22 Test::Pod: 1.41
21 Encode: '0'
22 Test::Pod: '1.41'
2323 requires:
24 perl: 5.006002
24 perl: '5.006002'
2525 resources:
26 bugtracker: http://github.com/theory/encode-cp1252/issues/
27 homepage: http://search.cpan.org/dist/Encode-CP1252/
26 bugtracker: https://github.com/theory/encode-zapcp1252/issues/
27 homepage: https://search.cpan.org/dist/Encode-ZapCP1252/
2828 license: http://dev.perl.org/licenses/
29 repository: http://github.com/theory/encode-cp1252/tree
30 version: 0.33
29 repository: https://github.com/theory/encode-zapcp1252
30 version: '0.40'
31 x_serialization_backend: 'CPAN::Meta::YAML version 0.018'
0 # Note: this file was auto-generated by Module::Build::Compat version 0.3800
0 # Note: this file was auto-generated by Module::Build::Compat version 0.4229
11 require 5.006002;
22 use ExtUtils::MakeMaker;
33 WriteMakefile
44 (
5 'NAME' => 'Encode::ZapCP1252',
5 'PL_FILES' => {},
6 'INSTALLDIRS' => 'site',
67 'VERSION_FROM' => 'lib/Encode/ZapCP1252.pm',
78 'PREREQ_PM' => {
89 'Module::Build' => '0.36',
910 'Test::More' => '0.17'
1011 },
11 'INSTALLDIRS' => 'site',
1212 'EXE_FILES' => [],
13 'PL_FILES' => {}
13 'NAME' => 'Encode::ZapCP1252'
1414 )
1515 ;
+0
-44
README less more
0 Encode/CP1252 version 0.33
1 ==========================
2
3 Have you ever been processing a Web form submit, assuming that the incoming
4 text was encoded in ISO-8859-1 (Latin-1), only to end up with a bunch of junk
5 because someone pasted in content from Microsoft Word? Well, this is because
6 Microsoft uses a superset of the Latin-1 encoding called "Windows Western" or
7 "CP1252". So mostly things will come out right, but a few things--like curly
8 quotes, m-dashes, ellipses, and the like--will not. The differences are
9 well-known; you see a nice chart at documenting the differences on
10 [Wikipedia](http://en.wikipedia.org/wiki/Windows-1252).
11
12 Of course, that won't really help you. So this library's module,
13 Encode::ZapCP1252, provides subroutines for removing Windows Western Gremlins
14 from strings, turning them into their appropriate UTF-8 or ASCII
15 approximations:
16
17 my $clean_latin1 = zap_cp1252 $latin1_text;
18 my $fixed_utf8 = fix_cp1252 $utf8_text;
19
20 Installation
21 ------------
22
23 To install this module, type the following:
24
25 perl Build.PL
26 ./Build
27 ./Build test
28 ./Build install
29
30 Or, if you don't have Module::Build installed, type the following:
31
32 perl Makefile.PL
33 make
34 make test
35 make install
36
37 Copyright and Licence
38 ---------------------
39
40 Copyright (c) 2005-2010 David E. Wheeler. Some Rights Reserved.
41
42 This module is free software; you can redistribute it and/or modify it under
43 the same terms as Perl itself.
0 Encode/ZapCP1252 version 0.40
1 =============================
2
3 [![CPAN version](https://badge.fury.io/pl/Encode-CP1252.svg)](https://badge.fury.io/pl/Encode-CP1252)
4 [![Build Status](https://github.com/theory/encode-zapcp1252/workflows/CI/badge.svg)](https://github.com/theory/encode-zapcp1252/actions/)
5
6 Have you ever been processing a Web form submit, assuming that the incoming
7 text was encoded in ISO-8859-1 (Latin-1), only to end up with a bunch of junk
8 because someone pasted in content from Microsoft Word? Well, this is because
9 Microsoft uses a superset of the Latin-1 encoding called "Windows Western" or
10 "CP1252". So mostly things will come out right, but a few things--like curly
11 quotes, m-dashes, ellipses, and the like--will not. The differences are
12 well-known; you can see a nice chart documenting the differences on
13 [Wikipedia](https://en.wikipedia.org/wiki/Windows-1252).
14
15 Of course, that won't really help you. So this library's module,
16 Encode::ZapCP1252, provides subroutines for removing Windows Western Gremlins
17 from strings, turning them into their appropriate UTF-8 or ASCII
18 approximations:
19
20 my $clean_latin1 = zap_cp1252 $latin1_text;
21 my $fixed_utf8 = fix_cp1252 $utf8_text;
22
23 Installation
24 ------------
25
26 To install this module, type the following:
27
28 perl Build.PL
29 ./Build
30 ./Build test
31 ./Build install
32
33 Or, if you don't have Module::Build installed, type the following:
34
35 perl Makefile.PL
36 make
37 make test
38 make install
39
40 Copyright and Licence
41 ---------------------
42
43 Copyright (c) 2005-2020 David E. Wheeler. Some Rights Reserved.
44
45 This module is free software; you can redistribute it and/or modify it under
46 the same terms as Perl itself.
44 use vars qw($VERSION @ISA @EXPORT);
55 use 5.006_002;
66
7 $VERSION = '0.33';
7 $VERSION = '0.40';
88 @ISA = qw(Exporter);
99 @EXPORT = qw(zap_cp1252 fix_cp1252);
1010 use constant PERL588 => $] >= 5.008_008;
11 require Encode if PERL588;
11 use Encode ();
1212
1313 our %ascii_for = (
14 # http://en.wikipedia.org/wiki/Windows-1252
14 # https://en.wikipedia.org/wiki/Windows-1252
1515 "\x80" => 'e', # EURO SIGN
1616 "\x82" => ',', # SINGLE LOW-9 QUOTATION MARK
1717 "\x83" => 'f', # LATIN SMALL LETTER F WITH HOOK
4242 );
4343
4444 our %utf8_for = (
45 # http://en.wikipedia.org/wiki/Windows-1252
45 # https://en.wikipedia.org/wiki/Windows-1252
4646 "\x80" => '€', # EURO SIGN
4747 "\x82" => ',', # SINGLE LOW-9 QUOTATION MARK
4848 "\x83" => 'ƒ', # LATIN SMALL LETTER F WITH HOOK
7272 "\x9f" => 'Ÿ', # LATIN CAPITAL LETTER Y WITH DIAERESIS
7373 );
7474
75 my @utf8_skip = (
76 # This translates a utf-8-encoded byte into how many bytes the full utf8
77 # character occupies. Illegal start bytes have a negative count.
78
79 # UTF-8 is a variable-length encoding. The 128 ASCII characters were very
80 # deliberately set to be themselves, so UTF-8 would be backwards compatible
81 # with 7-bit applications. Every other character has 2 - 13 bytes comprising
82 # it.
83 #
84 # If the first bit of the first byte in a character is 0, it is one of those
85 # 128 ASCII characters with length 1.
86
87 # Otherwise, the first bit is 1, and if the second bit is also one, this byte
88 # starts the sequence of bytes that represent the character. The bytes C0-FF
89 # have the characteristic that the first two bits are both one. The number of
90 # bytes that form a character corresponds to the number of consecutive leading
91 # bits that are all one in the start byte. In the case of FE, the first 7
92 # bits are one, so the number of bytes in the character it represents is 7.
93 # FF is a special case, and Perl has arbitrarily set it to 13 instead of the
94 # expected 8.
95 #
96 # The remaining bytes begin with '10', from 80..BF. They are called
97 # continuation bytes, and a UTF-8 character is comprised of a start byte
98 # indicating 'n' bytes total in it, then 'n-1' of these continuation bytes.
99 # What the character is that each sequence represents is derived by shifting
100 # and adding the other bits in the bytes. (C0 and C1 aren't actually legal
101 # start bytes for security reasons that need not concern us here, hence are
102 # marked as negative in the table below.)
103
104 # 0 1 2 3 4 5 6 7 8 9 A B C D E F
105 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0
106 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 1
107 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 2
108 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 3
109 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 4
110 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 5
111 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 6
112 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 7
113 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 8
114 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # 9
115 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # A
116 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, # B
117 -1,-1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # C
118 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # D
119 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # E
120 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7,13, # F
121 );
122
75123 BEGIN {
76124 my $proto = $] >= 5.010000 ? '_' : '$';
77125 eval "sub zap_cp1252($proto) { unshift \@_, \\%ascii_for; &_tweakit; }";
78126 eval "sub fix_cp1252($proto) { unshift \@_, \\%utf8_for; &_tweakit; }";
79127 }
80128
129 # These are the bytes that CP1252 redefines
130 my $cp1252_re = qr/[\x80\x82-\x8c\x8e\x91-\x9c\x9e\x9f]/;
131
81132 sub _tweakit {
82133 my $table = shift;
83134 return unless defined $_[0];
84135 local $_[0] = $_[0] if defined wantarray;
85 if (PERL588 && Encode::is_utf8($_[0])) {
86 _tweak_decoded($table, $_[0]);
87 } else {
88 $_[0] =~ s{([\x80-\x9f])}{$table->{$1} || $1}emxsg;
136 my $is_utf8 = PERL588 && Encode::is_utf8($_[0]);
137 my $valid_utf8 = $is_utf8 && utf8::valid($_[0]);
138 if (!$is_utf8) {
139
140 # Here is non-UTF-8. Change the 1252 characters to their UTF-8
141 # counterparts. These bytes are very rarely used in real world
142 # applications, so their presence likely indicates that CP1252 was
143 # meant.
144 $_[0] =~ s/($cp1252_re)/$table->{$1}/gems;
145 } elsif ($valid_utf8) {
146
147 # Here is well-formed Perl extended UTF-8 and has the UTF-8 flag on
148 # and the string is held as bytes. Change the 1252 characters to their
149 # Unicode counterparts.
150 $_[0] =~ s/($cp1252_re)/Encode::decode_utf8($table->{$1})/gems;
151 } else { # Invalid UTF-8. Look for single-byte CP1252 gremlins
152
153 # Turn off the UTF-8 flag so that we can go through the string
154 # byte-by-byte.
155 Encode::_utf8_off($_[0]);
156
157 my $i = 0;
158 my $length = length $_[0];
159 my $fixed = ""; # The input after being fixed up by this loop
160 while ($i < $length) {
161
162 # Each time through the loop, we should here be ready to look at a
163 # new character, and its 0th byte is called a 'start byte'
164 my $start_byte = substr($_[0], $i, 1);
165 my $skip = $utf8_skip[ord $start_byte];
166
167 # The table is set up so that legal UTF-8 start bytes have a
168 # positive byte length. Simply add all the bytes in the character
169 # to the output, and go on to handle the next character in the
170 # next loop iteration.
171 if ($skip > 0) {
172 $fixed .= substr($_[0], $i, $skip);
173 $i += $skip;
174 next;
175 }
176
177 # Here we have a byte that isn't a start byte in a position that
178 # ought to be a start byte. The whole point of this loop is
179 # to find such bytes that are CP1252 ones and which were
180 # incorrectly inserted by the upstream process into an otherwise
181 # valid UTF-8 string. So, if we have such a one, change it into
182 # its corresponding correct character.
183 if ($start_byte =~ s/($cp1252_re)/$table->{$1}/ems) {
184
185 # The correct character may be UTF-8 bytes. We treat them as
186 # just a sequence of non-UTF-8 bytes, because that's what
187 # $fixed has in it so far. After everything is consistently
188 # added, we turn the UTF-8 flag back on before returning at
189 # the end.
190 Encode::_utf8_off($start_byte);
191 $fixed .= $start_byte;
192 $i++;
193 next;
194 }
195
196 # Here the byte isn't a CP1252 one.
197 die "Unexpected continuation byte: %02x", ord $start_byte;
198 }
199
200 # $fixed now has everything properly in it, but set to return it in
201 # $_[0], marked as UTF-8.
202 $_[0] = $fixed;
203 Encode::_utf8_on($_[0]);
89204 }
90205 return $_[0] if defined wantarray;
91 }
92
93 sub _tweak_decoded {
94 my $table = shift;
95 local $@;
96 # First, try to replace in the decoded string.
97 eval {
98 $_[0] =~ s{([\x80-\x9f])}{
99 $table->{$1} ? Encode::decode('UTF-8', $table->{$1}) : $1
100 }emxsg
101 };
102 if (my $err = $@) {
103 # If we got a "Malformed UTF-8 character" error, then someone
104 # likely turned on the utf8 flag without decoding. So turn it off.
105 # and try again.
106 die if $err !~ /Malformed/;
107 Encode::_utf8_off($_[0]);
108 $_[0] =~ s/([\x80-\x9f])/$table->{$1} || $1/emxsg;
109 Encode::_utf8_on($_[0]);
110 }
111206 }
112207
113208 1;
141236 encoding is Latin-1, mostly things will come out right, but a few things--like
142237 curly quotes, m-dashes, ellipses, and the like--may not. The differences are
143238 well-known; you can see a nice chart documenting the differences on
144 L<Wikipedia|http://en.wikipedia.org/wiki/Windows-1252>.
239 L<Wikipedia|https://en.wikipedia.org/wiki/Windows-1252>.
145240
146241 Of course, that won't really help you. What will help you is to quit using
147242 Latin-1 and switch to UTF-8. Then you can just convert from CP1252 to UTF-8
160255 gremlins mixed in with properly encoded characters. I've seen examples of just
161256 this sort of thing when processing GMail messages and attempting to insert
162257 them into a UTF-8 database, as well as in some feeds processed by, say
163 L<Yahoo! Pipes|http://pipes.yahoo.com>. Doesn't work so well. For such cases,
164 there's C<fix_cp1252>, which converts those CP1252 gremlins into their UTF-8
165 equivalents.
258 Yahoo! Pipes. Doesn't work so well. For such cases, there's C<fix_cp1252>,
259 which converts those CP1252 gremlins into their UTF-8 equivalents.
166260
167261 =head1 Usage
168262
186280
187281 In this case, even constant values can be processed. Either way, C<undef>s
188282 will be ignored.
283
284 In Perl 5.10 and higher, the functions may optionally be called with no
285 arguments, in which case C<$_> will be converted, instead:
286
287 zap_cp1252; # Modify $_ in-place.
288 fix_cp1252; # Modify $_ in-place.
289 my $zapped = zap_cp1252; # Copy $_ and return zapped
290 my $fixed = fix_cp1252; # Copy $_ and return fixed
189291
190292 In Perl 5.8.8 and higher, the conversion will work even when the string is
191293 decoded to Perl's internal form (usually via C<decode 'ISO-8859-1', $text>) or
196298 removing those CP1252 gremlins no matter what kind of processing has already
197299 been executed on the string.
198300
199 In Perl 5.10 and higher, the functions may optionally be called with no
200 arguments, in which case C<$_> will be converted, instead:
201
202 zap_cp1252; # Modify $_ in-place.
203 fix_cp1252; # Modify $_ in-place.
204 my $zapped = zap_cp1252; # Copy $_ and return zapped
205 my $fixed = zap_cp1252; # Copy $_ and return fixed
301 That said, although C<fix_cp1252()> takes a conservative approach to replacing
302 text in Unicode strings, it should be used as a very last option. Really,
303 avoid that situation if you can.
206304
207305 =head1 Conversion Table
208306
250348
251349 local $Encode::ZapCP1252::ascii_for{"\x80"} = 'E';
252350
253 Or if, for some bizarre reason, you wanted the UTF-8 equivalent for a bullet
254 converted by C<fix_cp1252()> to really be an asterisk (why would you? Just use
255 C<zap_cp1252> for that!), you can do this:
256
257 local $Encode::ZapCP1252::utf8_for{"\x95"} = '*';
351 Or if, for some reason, you wanted the UTF-8 equivalent for a bullet
352 converted by C<fix_cp1252()> to be a black square, you can assign the
353 bytes (never a Unicode string) like so:
354
355 local $Encode::ZapCP1252::utf8_for{"\x95"} = Encode::encode_utf8('■');
258356
259357 Just remember, without C<local> this would be a global change. In that case,
260358 be careful if your code zaps CP1252 elsewhere. Of course, it shouldn't really
268366
269367 =item L<Encode>
270368
271 =item L<Wikipedia: Windows-1252|http://en.wikipedia.org/wiki/Windows-1252>
369 =item L<Encoding::FixLatin>
370
371 =item L<Wikipedia: Windows-1252|https://en.wikipedia.org/wiki/Windows-1252>
272372
273373 =back
274374
275375 =head1 Support
276376
277377 This module is stored in an open L<GitHub
278 repository|http://github.com/theory/encode-cp1252/tree/>. Feel free to fork
378 repository|https://github.com/theory/encode-zapcp1252/>. Feel free to fork
279379 and contribute!
280380
281381 Please file bug reports via L<GitHub
282 Issues|http://github.com/theory/encode-cp1252/issues/> or by sending mail to
382 Issues|https://github.com/theory/encode-zapcp1252/issues/> or by sending mail to
283383 L<bug-Encode-CP1252@rt.cpan.org|mailto:bug-Encode-CP1252@rt.cpan.org>.
284384
285385 =head1 Author
289389 =head1 Acknowledgments
290390
291391 My thanks to Sean Burke for sending me his original method for converting
292 CP1252 gremlins to more-or-less appropriate ASCII characters.
392 CP1252 gremlins to more-or-less appropriate ASCII characters, and to Karl
393 Williamson for more correct handling of Unicode strings.
293394
294395 =head1 Copyright and License
295396
296 Copyright (c) 2005-2010 David E. Wheeler. Some Rights Reserved.
397 Copyright (c) 2005-2020 David E. Wheeler. Some Rights Reserved.
297398
298399 This module is free software; you can redistribute it and/or modify it under the
299400 same terms as Perl itself.
55 BEGIN {
66 plan skip_all => 'These tests require Perl 5.8.8 or higher'
77 unless $] >= 5.008_008;
8 plan tests => 6;
8 plan tests => 10;
99 }
1010
1111 BEGIN { use_ok 'Encode::ZapCP1252' or die; }
1414 my $ascii = q{e , f ,, ... + ++ ^ % S < OE Z ' ' " " * - -- ~ (tm) s > oe z Y};
1515 my $utf8 = q{€ , ƒ „ … † ‡ ˆ ‰ Š ‹ Œ Ž ‘ ’ “ ” • – — ˜ ™ š › œ ž Ÿ};
1616
17 # Test conversion of decoded from ISO-8859-1.
17 # Test conversion of text decoded from ISO-8859-1.
1818 my $fix_me = Encode::decode(
1919 'ISO-8859-1',
2020 join ' ', map { chr } 0x80, 0x82 .. 0x8c, 0x8e, 0x91 .. 0x9c, 0x9e, 0x9f
4444 is $fix_me, $ascii, 'Convert utf8-bit-flipped to ascii';
4545
4646 # Test conversion to decoded with modified table.
47 my $euro = $Encode::ZapCP1252::utf8_for{"\x80"};
4748 $Encode::ZapCP1252::utf8_for{"\x80"} = 'E';
4849 $utf8 =~ s/€/E/;
4950
5556 fix_cp1252 $fix_me;
5657 is $fix_me, $utf8, 'Convert decoded from Latin-1 with modified table';
5758
59 # Test it with the valid use of one of the gremlins (π is [0xcf,0x80]) in UTF-8.
60 is fix_cp1252 'π', 'π', 'Should not convert valid use of 0x80';
61 is zap_cp1252 'π', 'π', 'Should not zap valid use of 0x80';
5862
63 # But it should convert it if it's not UTF-8.
64 my $utf8_euro = Encode::encode_utf8($euro);
65 $Encode::ZapCP1252::utf8_for{"\x80"} = $utf8_euro;
66 is fix_cp1252 "\xCF\x80", "\xCF" . $utf8_euro,
67 'Should convert 0x80 when not parsing UTF-8';
68 is zap_cp1252 "\xCF\x80", qq{\xCF$Encode::ZapCP1252::ascii_for{"\x80"}},
69 'Should convert 0x80 to ASCII when not parsing UTF-8';
+0
-9
t/pod.t less more
0 #!perl -w
1
2 use strict;
3 use Test::More;
4 eval 'use Test::Pod 1.41';
5 plan skip_all => 'Test::Pod 1.41 required for testing POD' if $@;
6 eval 'use Encode';
7 plan skip_all => 'Encode 1.20 required for testing POD because it has UTF-8 characters' if $@;
8 all_pod_files_ok();