Imported Upstream version 1.10
gregor herrmann
9 years ago
0 | 0 | Revision history for Perl extension HTML::Strip. |
1 | ||
2 | 1.10 Tue Sep 30 14:34:47 UTC 2014 | |
3 | - Fix for RT#99207 (script mathematical symbol bug). | |
4 | ||
5 | 1.09 Tue Sep 30 10:39:47 UTC 2014 | |
6 | - offbyone.t disabled under Windows (RT#99219) | |
7 | ||
8 | 1.08 Fri Sep 26 15:02:37 UTC 2014 | |
9 | - system perl used in offbyone.t (RT#99151) | |
10 | ||
11 | 1.07 Tue Sep 23 14:44:08 UTC 2014 | |
12 | - fix to bug RT#19036 - tags not replaced with spaces when only a single | |
13 | character is between the tags | |
14 | - fix to bug RT#35345 - mathematical conparisons within <script> tags | |
15 | misunderstood | |
16 | (patches contributed by Adriano Ferreira) | |
17 | - Exporter was never needed | |
18 | - Allow other filtering operations than just decoding of HTML entities | |
19 | - Modernised test suite | |
20 | - Adds 'auto_reset' attribute, which allows automagic use of $hs->eof | |
21 | - fixes quotes in html comments (RT#32355) | |
22 | (patch contributed by Reini Urban) | |
23 | - MSVC doesnt define strcasecmp, use stricmp instead | |
24 | (patch contributed by Damyan Ivanov) | |
25 | - fixes POD errors | |
1 | 26 | |
2 | 27 | 1.06 Fri Feb 10 11:18:35 2006 |
3 | 28 | - documented 'set_decode_entities' method |
0 | 0 | Changes |
1 | 1 | Makefile.PL |
2 | 2 | MANIFEST |
3 | META.yml | |
3 | 4 | README |
4 | 5 | Strip.pm |
5 | 6 | Strip.xs |
6 | 7 | strip_html.h |
7 | 8 | strip_html.c |
8 | 9 | typemap |
9 | test.pl | |
10 | t/basic.t | |
11 | t/auto-reset.t | |
12 | t/comment.t | |
13 | t/edge-case.t | |
14 | t/filter.t | |
15 | t/mathematical-comparisons.t | |
16 | t/offbyone.t | |
17 | t/striptags.t | |
18 | t/whitespace-single-char.t |
0 | --- | |
1 | abstract: 'Perl extension for stripping HTML markup from text.' | |
2 | author: | |
3 | - 'Alex Bowley <kilinrax@cpan.org>' | |
4 | build_requires: | |
5 | ExtUtils::MakeMaker: 0 | |
6 | configure_requires: | |
7 | ExtUtils::MakeMaker: 0 | |
8 | distribution_type: module | |
9 | dynamic_config: 0 | |
10 | generated_by: 'ExtUtils::MakeMaker version 6.57_05' | |
11 | license: unknown | |
12 | meta-spec: | |
13 | url: http://module-build.sourceforge.net/META-spec-v1.4.html | |
14 | version: 1.4 | |
15 | name: HTML-Strip | |
16 | no_index: | |
17 | directory: | |
18 | - t | |
19 | - inc | |
20 | requires: {} | |
21 | version: 1.08 |
0 | ||
1 | use 5.006; | |
0 | 2 | use ExtUtils::MakeMaker; |
3 | my $EUMM_VERSION = eval $ExtUtils::MakeMaker::VERSION; | |
1 | 4 | |
2 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence | |
3 | # the contents of the Makefile that is written. | |
4 | 5 | WriteMakefile( |
5 | 'NAME' => 'HTML::Strip', | |
6 | 'VERSION_FROM' => 'Strip.pm', # finds $VERSION | |
7 | 'PREREQ_PM' => {}, # e.g., Module::Name => 1.1 | |
8 | ($] >= 5.005 ? ## Add these new keywords supported since 5.005 | |
9 | (ABSTRACT_FROM => 'Strip.pm', # retrieve abstract from module | |
10 | AUTHOR => 'Alex Bowley <kilinrax@cpan.org>') : ()), | |
11 | 'LIBS' => [''], # e.g., '-lm' | |
12 | 'DEFINE' => '', # e.g., '-DHAVE_SOMETHING' | |
13 | # Insert -I. if you add *.h files later: | |
14 | 'INC' => '', # e.g., '-I/usr/include/other' | |
15 | # Un-comment this if you add C files to link with later: | |
16 | 'OBJECT' => '$(O_FILES)', # link all the C files too | |
6 | 'NAME' => 'HTML::Strip', | |
7 | 'VERSION_FROM' => 'Strip.pm', | |
8 | 'PREREQ_PM' => { | |
9 | # core modules | |
10 | 'warnings' => 0, | |
11 | 'strict' => 0, | |
12 | 'Carp' => 0, | |
13 | 'Exporter' => 0, | |
14 | 'DynaLoader' => 0, | |
15 | ||
16 | # build requires | |
17 | 'Test::More' => 0, | |
18 | }, | |
19 | ( $] >= 5.005 ? ( | |
20 | ABSTRACT_FROM => 'Strip.pm', | |
21 | AUTHOR => 'Alex Bowley <kilinrax@cpan.org>' | |
22 | ) : () ), | |
23 | ( $EUMM_VERSION >= 6.46 ? ( | |
24 | LICENSE => 'perl', | |
25 | META_MERGE => { | |
26 | recommended => { | |
27 | 'HTML::Entities' => 0, | |
28 | }, | |
29 | }, | |
30 | ) : () ), | |
31 | 'LIBS' => [''], # e.g., '-lm' | |
32 | 'DEFINE' => '', # e.g., '-DHAVE_SOMETHING' | |
33 | 'INC' => '', # e.g., '-I/usr/include/other' | |
34 | 'OBJECT' => '$(O_FILES)', # link all the C files too | |
17 | 35 | ); |
0 | 0 | package HTML::Strip; |
1 | ||
2 | require DynaLoader; | |
3 | our @ISA = qw(DynaLoader); | |
4 | our $VERSION = '1.10'; | |
5 | bootstrap HTML::Strip $VERSION; | |
1 | 6 | |
2 | 7 | use 5.006; |
3 | 8 | use warnings; |
4 | 9 | use strict; |
5 | 10 | |
6 | use Carp qw( carp croak ); | |
7 | ||
8 | require Exporter; | |
9 | require DynaLoader; | |
10 | ||
11 | our @ISA = qw(Exporter DynaLoader); | |
12 | ||
13 | # Items to export into callers namespace by default. Note: do not export | |
14 | # names by default without a very good reason. Use EXPORT_OK instead. | |
15 | # Do not simply export all your public functions/methods/constants. | |
16 | ||
17 | # This allows declaration use HTML::Strip ':all'; | |
18 | # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK | |
19 | # will save memory. | |
20 | our %EXPORT_TAGS = ( 'all' => [ qw( | |
21 | ) ] ); | |
22 | ||
23 | our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); | |
24 | ||
25 | our @EXPORT = qw(); | |
26 | ||
27 | our $VERSION = '1.06'; | |
28 | ||
29 | bootstrap HTML::Strip $VERSION; | |
30 | ||
31 | # Preloaded methods go here. | |
32 | ||
33 | my $_html_entities_p = eval 'require HTML::Entities'; | |
11 | use Carp; | |
12 | ||
13 | my $_html_entities_p = eval { require HTML::Entities; 1 }; | |
34 | 14 | |
35 | 15 | my %defaults = ( |
36 | striptags => [qw( title | |
37 | style | |
38 | script | |
39 | applet )], | |
40 | emit_spaces => 1, | |
41 | decode_entities => 1, | |
42 | ); | |
16 | striptags => [qw( title | |
17 | style | |
18 | script | |
19 | applet )], | |
20 | emit_spaces => 1, | |
21 | decode_entities => 1, | |
22 | filter => $_html_entities_p ? 'filter_entities' : undef, | |
23 | auto_reset => 0, | |
24 | debug => 0, | |
25 | ); | |
43 | 26 | |
44 | 27 | sub new { |
45 | my $class = shift; | |
46 | my $obj = create(); | |
47 | bless $obj, $class; | |
48 | ||
49 | my %args = (%defaults, @_); | |
50 | while( my ($key, $value) = each %args ) { | |
51 | my $method = "set_${key}"; | |
52 | if( $obj->can($method) ) { | |
53 | $obj->$method($value); | |
28 | my $class = shift; | |
29 | my $obj = create(); | |
30 | bless $obj, $class; | |
31 | ||
32 | my %args = (%defaults, @_); | |
33 | while( my ($key, $value) = each %args ) { | |
34 | my $method = "set_${key}"; | |
35 | if( $obj->can($method) ) { | |
36 | $obj->$method($value); | |
37 | } else { | |
38 | Carp::carp "Invalid setting '$key'"; | |
39 | } | |
40 | } | |
41 | return $obj; | |
42 | } | |
43 | ||
44 | sub set_striptags { | |
45 | my ($self, @tags) = @_; | |
46 | if( ref($tags[0]) eq 'ARRAY' ) { | |
47 | $self->set_striptags_ref( $tags[0] ); | |
54 | 48 | } else { |
55 | carp "Invalid setting '$key'"; | |
56 | } | |
57 | } | |
58 | return $obj; | |
59 | } | |
60 | ||
61 | sub set_striptags { | |
62 | my ($self, @tags) = @_; | |
63 | if( ref($tags[0]) eq 'ARRAY' ) { | |
64 | $self->set_striptags_ref( $tags[0] ); | |
65 | } else { | |
66 | $self->set_striptags_ref( \@tags ); | |
67 | } | |
49 | $self->set_striptags_ref( \@tags ); | |
50 | } | |
51 | } | |
52 | ||
53 | { | |
54 | # an inside-out object approach | |
55 | # for the 'filter' attribute | |
56 | my %filter_of; | |
57 | ||
58 | sub set_filter { | |
59 | my ($self, $filter) = @_; | |
60 | $filter_of{0+$self} = $filter; | |
61 | } | |
62 | ||
63 | sub filter { | |
64 | my $self = shift; | |
65 | return $filter_of{0+$self} | |
66 | } | |
67 | ||
68 | sub DESTROY { | |
69 | my $self = shift; | |
70 | delete $filter_of{0+$self}; | |
71 | } | |
72 | } | |
73 | ||
74 | # $decoded_string = $self->filter_entities( $string ) | |
75 | sub filter_entities { | |
76 | my $self = shift; | |
77 | if( $self->decode_entities ) { | |
78 | return HTML::Entities::decode($_[0]); | |
79 | } | |
80 | return $_[0]; | |
81 | } | |
82 | ||
83 | sub _do_filter { | |
84 | my $self = shift; | |
85 | my $filter = $self->filter; | |
86 | # no filter: return immediately | |
87 | return $_[0] unless defined $filter; | |
88 | ||
89 | if ( !ref $filter ) { # method name | |
90 | return $self->$filter( @_ ); | |
91 | } else { # code ref | |
92 | return $filter->( @_ ); | |
93 | } | |
68 | 94 | } |
69 | 95 | |
70 | 96 | sub parse { |
71 | my ($self, $text) = @_; | |
72 | my $stripped = $self->strip_html( $text ); | |
73 | if( $self->decode_entities && $_html_entities_p ) { | |
74 | $stripped = HTML::Entities::decode($stripped); | |
75 | } | |
76 | return $stripped; | |
97 | my ($self, $text) = @_; | |
98 | my $stripped = $self->strip_html( $text ); | |
99 | return $self->_do_filter( $stripped ); | |
77 | 100 | } |
78 | 101 | |
79 | 102 | sub eof { |
80 | my $self = shift; | |
81 | $self->reset(); | |
103 | my $self = shift; | |
104 | $self->reset(); | |
82 | 105 | } |
83 | 106 | |
84 | 107 | 1; |
135 | 158 | declaration or a comment. Within such tags, C<E<gt>> characters do not |
136 | 159 | end the tag if they appear within pairs of double dashes (e.g. C<E<lt>!-- |
137 | 160 | E<lt>a href="old.htm"E<gt>old pageE<lt>/aE<gt> --E<gt>> would be |
138 | stripped completely). | |
161 | stripped completely). Inside a comment, no parsing for quotes | |
162 | is done as well. (That means C<E<lt>!-- comment with ' quote " --E<gt>> | |
163 | are entirely stripped.) | |
139 | 164 | |
140 | 165 | =back |
141 | 166 | |
153 | 178 | next call to parse to start with the remains of said tag. |
154 | 179 | |
155 | 180 | If this is not going to be the case, be sure to call $hs->eof() |
156 | between calls to $hs->parse(). | |
181 | between calls to $hs->parse(). Alternatively, you may | |
182 | set C<auto_reset> to true on the constructor or any time | |
183 | after with C<set_auto_reset>, so that the parser will always | |
184 | operate in one-shot basis (resetting after each parsed chunk). | |
157 | 185 | |
158 | 186 | =head2 METHODS |
187 | ||
188 | =over | |
159 | 189 | |
160 | 190 | =item new() |
161 | 191 | |
200 | 230 | Takes a boolean value. If set to false, HTML::Strip will decode HTML |
201 | 231 | entities. Set to true by default. |
202 | 232 | |
233 | =item filter_entities() | |
234 | ||
235 | If HTML::Entities is available, this method behaves just | |
236 | like invoking HTML::Entities::decode_entities, except that | |
237 | it respects the current setting of 'decode_entities'. | |
238 | ||
239 | =item set_filter() | |
240 | ||
241 | Sets a filter to be applied after tags were stripped. | |
242 | It may accept the name of a method (like 'filter_entities') | |
243 | or a code ref. By default, its value is 'filter_entities' | |
244 | if HTML::Entities is available or C<undef> otherwise. | |
245 | ||
246 | =item set_auto_reset() | |
247 | ||
248 | Takes a boolean value. If set to true, C<parse> resets after | |
249 | each call (equivalent to calling C<eof>). Otherwise, the | |
250 | parser remembers its state from one call to C<parse> to | |
251 | another, until you call C<eof> explicitly. Set to false | |
252 | by default. | |
253 | ||
254 | =back | |
255 | ||
203 | 256 | =head2 LIMITATIONS |
204 | 257 | |
205 | 258 | =over 4 |
226 | 279 | HTML::Strip will only attempt decoding of HTML entities if |
227 | 280 | L<HTML::Entities> is installed. |
228 | 281 | |
282 | =back | |
283 | ||
229 | 284 | =head2 EXPORT |
230 | 285 | |
231 | 286 | None by default. |
4 | 4 | |
5 | 5 | #include "strip_html.h" |
6 | 6 | |
7 | MODULE = HTML::Strip PACKAGE = HTML::Strip | |
7 | MODULE = HTML::Strip PACKAGE = HTML::Strip | |
8 | 8 | |
9 | 9 | PROTOTYPES: ENABLE |
10 | 10 | |
38 | 38 | strip_html( stripper, raw, clean ); |
39 | 39 | RETVAL = clean; |
40 | 40 | OUTPUT: |
41 | RETVAL | |
41 | RETVAL | |
42 | 42 | CLEANUP: |
43 | 43 | Safefree( clean ); |
44 | 44 | |
87 | 87 | int n; |
88 | 88 | if( (SvROK(tagref)) && |
89 | 89 | (SvTYPE(SvRV(tagref)) == SVt_PVAV) ) { |
90 | tags = (AV *) SvRV(tagref); | |
90 | tags = (AV *) SvRV(tagref); | |
91 | 91 | } else { |
92 | 92 | XSRETURN_UNDEF; |
93 | 93 | } |
102 | 102 | char * tag = SvPV(*av_fetch(tags, n, 0), l); |
103 | 103 | add_striptag( stripper, tag ); |
104 | 104 | } |
105 | ||
106 | void | |
107 | set_auto_reset( stripper, value ) | |
108 | Stripper * stripper | |
109 | int value | |
110 | CODE: | |
111 | stripper->o_auto_reset = value; | |
112 | ||
113 | int | |
114 | auto_reset( stripper ) | |
115 | Stripper * stripper | |
116 | CODE: | |
117 | RETVAL = stripper->o_auto_reset; | |
118 | OUTPUT: | |
119 | RETVAL | |
120 | ||
121 | void | |
122 | set_debug( stripper, value ) | |
123 | Stripper * stripper | |
124 | int value | |
125 | CODE: | |
126 | stripper->o_debug = value; | |
127 | ||
128 | int | |
129 | debug( stripper ) | |
130 | Stripper * stripper | |
131 | CODE: | |
132 | RETVAL = stripper->o_debug; | |
133 | OUTPUT: | |
134 | RETVAL |
0 | ||
1 | 0 | #include <stdio.h> |
2 | 1 | #include <ctype.h> |
3 | 2 | #include <string.h> |
4 | 3 | #include "strip_html.h" |
5 | 4 | |
6 | ||
7 | 5 | void |
8 | 6 | strip_html( Stripper * stripper, const char * raw, char * output ) { |
9 | 7 | const char * p_raw = raw; |
10 | 8 | const char * raw_end = raw + strlen(raw); |
11 | 9 | char * p_output = output; |
12 | ||
10 | ||
13 | 11 | while( p_raw < raw_end ) { |
12 | if( stripper->o_debug ) { | |
13 | printf( "[DEBUG] char %c state %c %c %c tag:%5s, %c %c %c %c, %c %c %c %c:%c, ", | |
14 | *p_raw, | |
15 | (stripper->f_closing ? 'C' : ' '), | |
16 | (stripper->f_in_tag ? 'T' : ' '), | |
17 | (stripper->f_full_tagname ? 'F' : ' '), | |
18 | stripper->tagname, | |
19 | (stripper->f_just_seen_tag ? 'J' : ' '), | |
20 | (stripper->f_outputted_space ? 'S' : ' '), | |
21 | (stripper->f_lastchar_slash ? '/' : ' '), | |
22 | (stripper->f_lastchar_minus ? '-' : ' '), | |
23 | (stripper->f_in_decl ? 'D' : ' '), | |
24 | (stripper->f_in_comment ? 'C' : ' '), | |
25 | (stripper->f_in_striptag ? 'X' : ' '), | |
26 | (stripper->f_in_quote ? 'Q' : ' '), | |
27 | (stripper->quote ? stripper->quote : ' ') | |
28 | ); | |
29 | } | |
14 | 30 | if( stripper->f_in_tag ) { |
15 | 31 | /* inside a tag */ |
16 | 32 | /* check if we know either the tagname, or that we're in a declaration */ |
22 | 38 | /* then check if the first character is a '/', in which case, this is a closing tag */ |
23 | 39 | else if( stripper->p_tagname == stripper->tagname && *p_raw == '/' ) { |
24 | 40 | stripper->f_closing = 1; |
25 | } else { | |
41 | } | |
42 | /* if the first character wasn't a '/', and we're in a stripped block, | |
43 | * assume this is a mathematical operator and reset */ | |
44 | else if( !stripper->f_closing && stripper->f_in_striptag && stripper->p_tagname == stripper->tagname && *p_raw != '/' ) { | |
45 | stripper->f_in_tag = 0; | |
46 | stripper->f_closing = 0; | |
47 | /* we only care about closing tags within a stripped tags block (e.g. scripts) */ | |
48 | } else if( !stripper->f_in_striptag || stripper->f_closing ) { | |
26 | 49 | /* if we don't have the full tag name yet, add current character unless it's whitespace, a '/', or a '>'; |
27 | 50 | otherwise null pad the string and set the full tagname flag, and check the tagname against stripped ones. |
28 | 51 | also sanity check we haven't reached the array bounds, and truncate the tagname here if we have */ |
40 | 63 | /* if we're outside a stripped tag block, check tagname against stripped tag list */ |
41 | 64 | } else if( !stripper->f_in_striptag && !stripper->f_closing ) { |
42 | 65 | int i; |
43 | for( i = 0; i <= stripper->numstriptags; i++ ) { | |
66 | for( i = 0; i < stripper->numstriptags; i++ ) { | |
44 | 67 | if( strcasecmp( stripper->tagname, stripper->o_striptags[i] ) == 0 ) { |
45 | 68 | stripper->f_in_striptag = 1; |
46 | 69 | strcpy( stripper->striptag, stripper->tagname ); |
60 | 83 | } |
61 | 84 | } else { |
62 | 85 | /* not in a quote */ |
63 | /* check for quote characters */ | |
64 | if( *p_raw == '\'' || *p_raw == '\"' ) { | |
86 | /* check for quote characters, but not in a comment */ | |
87 | if( !stripper->f_in_comment && | |
88 | ( *p_raw == '\'' || *p_raw == '\"' ) ) { | |
65 | 89 | stripper->f_in_quote = 1; |
66 | 90 | stripper->quote = *p_raw; |
67 | 91 | /* reset lastchar_* flags in case we have something perverse like '-"' or '/"' */ |
111 | 135 | /* output a space in place of tags we have previously parsed, |
112 | 136 | and set a flag so we only do this once for every group of tags. |
113 | 137 | done here to prevent unnecessary trailing spaces */ |
114 | if( isspace(*p_raw) ) { | |
138 | if( !isspace(*p_raw) && | |
115 | 139 | /* don't output a space if this character is one anyway */ |
140 | !stripper->f_outputted_space && | |
141 | stripper->f_just_seen_tag ) { | |
142 | if( stripper->o_debug ) { | |
143 | printf("SPACE "); | |
144 | } | |
145 | *p_output++ = ' '; | |
116 | 146 | stripper->f_outputted_space = 1; |
117 | } else { | |
118 | if( !stripper->f_outputted_space && | |
119 | stripper->f_just_seen_tag ) { | |
120 | *p_output++ = ' '; | |
121 | stripper->f_outputted_space = 1; | |
122 | } else { | |
123 | /* this character must not be a space */ | |
124 | stripper->f_outputted_space = 0; | |
125 | } | |
126 | 147 | } |
148 | } | |
149 | if( stripper->o_debug ) { | |
150 | printf("CHAR %c", *p_raw); | |
127 | 151 | } |
128 | 152 | *p_output++ = *p_raw; |
129 | 153 | /* reset 'just seen tag' flag */ |
130 | 154 | stripper->f_just_seen_tag = 0; |
155 | /* reset 'outputted space' flag if character is not one */ | |
156 | if (!isspace(*p_raw)) { | |
157 | stripper->f_outputted_space = 0; | |
158 | } else { | |
159 | stripper->f_outputted_space = 1; | |
160 | } | |
131 | 161 | } |
132 | 162 | } |
133 | 163 | } /* in tag check */ |
134 | 164 | p_raw++; |
165 | if( stripper->o_debug ) { | |
166 | printf("\n"); | |
167 | } | |
135 | 168 | } /* while loop */ |
136 | 169 | |
137 | 170 | *p_output = 0; |
171 | ||
172 | if (stripper->o_auto_reset) { | |
173 | reset( stripper ); | |
174 | } | |
138 | 175 | } |
139 | 176 | |
140 | 177 | void |
146 | 183 | /* hack to stop a space being output on strings starting with a tag */ |
147 | 184 | stripper->f_outputted_space = 1; |
148 | 185 | stripper->f_just_seen_tag = 0; |
149 | ||
186 | ||
150 | 187 | stripper->f_in_quote = 0; |
151 | 188 | |
152 | 189 | stripper->f_in_decl = 0; |
153 | 190 | stripper->f_in_comment = 0; |
154 | 191 | stripper->f_lastchar_minus = 0; |
155 | ||
192 | ||
156 | 193 | stripper->f_in_striptag = 0; |
194 | ||
157 | 195 | } |
158 | 196 | |
159 | 197 | void |
171 | 209 | } |
172 | 210 | } |
173 | 211 | |
212 | #ifdef _MSC_VER | |
213 | #define strcasecmp(a,b) stricmp(a,b) | |
214 | #endif | |
174 | 215 | |
175 | 216 | void |
176 | 217 | check_end( Stripper * stripper, char end ) { |
179 | 220 | stripper->f_lastchar_slash = 1; |
180 | 221 | } else { |
181 | 222 | /* if the current character is a '>', then the tag has ended */ |
182 | if( end == '>' ) { | |
223 | /* slight hack to deal with mathematical characters in script tags: | |
224 | * if we're in a stripped block, and this is a closing tag, spaces | |
225 | * will also end the tag, since we only want it for comparison with | |
226 | * the opening one */ | |
227 | if( (end == '>') || | |
228 | (stripper->f_in_striptag && stripper->f_closing && isspace(end)) ) { | |
183 | 229 | stripper->f_in_quote = 0; |
184 | 230 | stripper->f_in_comment = 0; |
185 | 231 | stripper->f_in_decl = 0; |
186 | 232 | stripper->f_in_tag = 0; |
233 | stripper->f_closing = 0; | |
187 | 234 | /* Do not start a stripped tag block if the tag is a closed one, e.g. '<script src="foo" />' */ |
188 | 235 | if( stripper->f_lastchar_slash && |
189 | 236 | (strcasecmp( stripper->striptag, stripper->tagname ) == 0) ) { |
26 | 26 | int numstriptags; |
27 | 27 | int o_emit_spaces; |
28 | 28 | int o_decode_entities; |
29 | ||
30 | int o_auto_reset; | |
31 | ||
32 | int o_debug; | |
29 | 33 | } Stripper; |
30 | 34 | |
31 | 35 | void strip_html( Stripper * stripper, const char * raw, char * clean ); |
0 | ||
1 | use Test::More tests => 5; | |
2 | ||
3 | BEGIN { use_ok 'HTML::Strip' } | |
4 | ||
5 | { | |
6 | my $hs = HTML::Strip->new; # auto_reset off by default | |
7 | my $o = $hs->parse( "<html>\nTitle\n<script>a+b\n" ); | |
8 | is( $o, "\nTitle\n" ); | |
9 | my $o2 = $hs->parse( "c+d\n</script>\nEnd\n</html>" ); | |
10 | is( $o2, "\nEnd\n" ); | |
11 | } | |
12 | ||
13 | { | |
14 | my $hs = HTML::Strip->new( auto_reset => 1 ); # auto_reset on | |
15 | my $o = $hs->parse( "<html>\nTitle\n<script>a+b\n" ); | |
16 | is( $o, "\nTitle\n" ); | |
17 | my $o2 = $hs->parse( "c+d\n</script>\nEnd\n</html>" ); | |
18 | is( $o2, "c+d\n\nEnd\n" ); | |
19 | } |
0 | use Test::More tests => 19; | |
1 | ||
2 | BEGIN { use_ok 'HTML::Strip' } | |
3 | ||
4 | { | |
5 | my $hs = new HTML::Strip; | |
6 | ||
7 | is( $hs->parse( 'test' ), 'test', 'works with plain text' ); | |
8 | $hs->eof; | |
9 | ||
10 | is( $hs->parse( '<em>test</em>' ), 'test', 'works with <em>|</em> tags' ); | |
11 | $hs->eof; | |
12 | ||
13 | is( $hs->parse( 'foo<br>bar' ), 'foo bar', 'works with <br> tag' ); | |
14 | $hs->eof; | |
15 | ||
16 | is( $hs->parse( '<p align="center">test</p>' ), 'test', 'works with tags with attributes' ); | |
17 | $hs->eof; | |
18 | ||
19 | is( $hs->parse( '<p align="center>test</p>' ), '', '"works" with non-terminated quotes' ); | |
20 | $hs->eof; | |
21 | ||
22 | is( $hs->parse( '<foo>bar' ), 'bar', 'strips <foo> tags' ); | |
23 | is( $hs->parse( '</foo>baz' ), ' baz', 'strips </foo> tags' ); | |
24 | $hs->eof; | |
25 | ||
26 | is( $hs->parse( '<!-- <p>foo</p> bar -->baz' ), 'baz', 'strip comments' ); | |
27 | $hs->eof; | |
28 | ||
29 | is( $hs->parse( '<img src="foo.gif" alt="a > b">bar' ), 'bar', 'works with quote attributes which contain >' ); | |
30 | $hs->eof; | |
31 | ||
32 | is( $hs->parse( '<script> if (a>b && a<c) { ... } </script>bar' ), 'bar', '<script> tag and content are stripped' ); | |
33 | $hs->eof; | |
34 | ||
35 | is( $hs->parse( '<# just data #>bar' ), 'bar', 'weird tags get stripped' ); | |
36 | $hs->eof; | |
37 | ||
38 | TODO: { | |
39 | local $TODO = "fix CDATA handling"; | |
40 | is( $hs->parse( '<![INCLUDE CDATA [ >>>>>>>>>>>> ]]>bar' ), 'bar', 'character data gets stripped' ); | |
41 | $hs->eof; | |
42 | } | |
43 | ||
44 | is( $hs->parse( '<script>foo</script>bar' ), 'bar', '<script> nodes are stripped' ); | |
45 | $hs->eof; | |
46 | ||
47 | my $has_html_entities = eval { require HTML::Entities; 1 }; | |
48 | SKIP: { | |
49 | skip 'HTML::Entities not available', 2 unless $has_html_entities; | |
50 | ||
51 | is( $hs->parse( '<foo>' ), '<foo>', 'numeric HTML entities are decoded' ); | |
52 | $hs->eof; | |
53 | is( $hs->parse( '<foo>' ), '<foo>', 'HTML entities are decoded' ); | |
54 | $hs->eof; | |
55 | } | |
56 | ||
57 | $hs->set_decode_entities(0); | |
58 | is( $hs->parse( '<foo>' ), '<foo>', 'entities decoding off works' ); | |
59 | $hs->eof; | |
60 | ||
61 | is( $hs->parse( '<foo>' ), '<foo>', 'entities decoding off works' ); | |
62 | $hs->eof; | |
63 | ||
64 | is( $hs->parse( '<script>foo</script>bar' ), 'bar', '"script" is a default strip_tag' ); | |
65 | $hs->eof; | |
66 | } |
0 | ||
1 | # http://rt.cpan.org/Public/Bug/Display.html?id=32355 | |
2 | ||
3 | use Test::More tests => 7; | |
4 | ||
5 | BEGIN { use_ok 'HTML::Strip' } | |
6 | ||
7 | # stripping declarations | |
8 | { | |
9 | my $hs = HTML::Strip->new(); | |
10 | is( $hs->parse( q{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"><html>Text</html>} ), | |
11 | "Text", 'decls are stripped' ); | |
12 | $hs->eof; | |
13 | } | |
14 | ||
15 | # stripping comments | |
16 | { | |
17 | my $hs = HTML::Strip->new(); | |
18 | is( $hs->parse( q{<html><!-- a comment to be stripped -->Hello World!</html>} ), | |
19 | "Hello World!", "comments are stripped" ); | |
20 | $hs->eof; | |
21 | ||
22 | is( $hs->parse( q{<html><!-- comment with a ' apos -->Hello World!</html>} ), | |
23 | "Hello World!", q{comments may contain '} ); | |
24 | $hs->eof; | |
25 | ||
26 | is( $hs->parse( q{<html><!-- comment with a " quote -->Hello World!</html>} ), | |
27 | "Hello World!", q{comments may contain "} ); | |
28 | $hs->eof; | |
29 | ||
30 | is( $hs->parse( q{<html><!-- comment -- "quote" >Hello World!</html>} ), | |
31 | "Hello World!", "weird decls are stripped" ); | |
32 | $hs->eof; | |
33 | ||
34 | is( $hs->parse( "a<>b" ), | |
35 | "a b", 'edge case with <> ok' ); | |
36 | ||
37 | } |
0 | use Test::More; | |
1 | ||
2 | BEGIN { use_ok 'HTML::Strip' } | |
3 | ||
4 | # test for RT#21008 | |
5 | ||
6 | # stripping comments | |
7 | { | |
8 | my $hs = HTML::Strip->new(); | |
9 | is( $hs->parse( "a<>b" ), "a b", 'edge case with <> ok' ); | |
10 | $hs->eof; | |
11 | is( $hs->parse( "a<>b c<>d" ), "a b c d", 'edge case with <>s ok' ); | |
12 | $hs->eof; | |
13 | is( $hs->parse( "From: <>\n\na. Title: some text\n\nb. etc\n" ), "From: \n\na. Title: some text\n\nb. etc\n", 'test case' ); | |
14 | is( $hs->parse( "From: <>\n\na. Title: some text\n\nb. etc\n" ), "From: \n\na. Title: some text\n\nb. etc\n", 'test case' ); | |
15 | $hs->eof; | |
16 | is( $hs->parse( q{this is an "example" with 'quoted' parts that should not be stripped} ), q{this is an "example" with 'quoted' parts that should not be stripped} ); | |
17 | } | |
18 | ||
19 | done_testing; |
0 | use Test::More tests => 3; | |
1 | ||
2 | BEGIN { use_ok 'HTML::Strip' } | |
3 | ||
4 | { | |
5 | my $hs = HTML::Strip->new( filter => undef ); | |
6 | ok( $hs->parse( '<html> </html>' ), ' ' ); | |
7 | $hs->eof; | |
8 | ||
9 | } | |
10 | ||
11 | { | |
12 | my $filter = sub { my $s = shift; $s =~ s/\s/ /g;; $s }; | |
13 | my $hs = HTML::Strip->new( filter => $filter ); | |
14 | ok( $hs->parse( "<html>title\ntext\ntext</html>" ), 'title text text' ); | |
15 | $hs->eof; | |
16 | ||
17 | } |
0 | use Test::More tests => 3; | |
1 | ||
2 | BEGIN { use_ok 'HTML::Strip' } | |
3 | ||
4 | # test for RT#35345 | |
5 | { | |
6 | my $hs = HTML::Strip->new(); | |
7 | is( $hs->parse( <<EOF ), "\nHello\n", "mathematical comparisons in strip tags bug RT#35345" ); | |
8 | <script> | |
9 | function shovelerMain (detectBuyBox) { | |
10 | for (var i = 0; i < Shoveler.Instances.length; i++) { | |
11 | ... | |
12 | </script> | |
13 | <h1>Hello</h1> | |
14 | EOF | |
15 | $hs->eof; | |
16 | } | |
17 | ||
18 | # test for RT#99207 | |
19 | { | |
20 | my $hs = HTML::Strip->new(); | |
21 | is( $hs->parse( <<EOF ), "\nhallo\n", "mathematical comparisons in strip tags bug RT#99207" ); | |
22 | <script type="text/javascript"> | |
23 | document.write('<scr'+'ipt src="//www3.smartadserver.com/call/pubj/' + sas_config.pageid + '/' + formatid + '/' + sas_config.master + '/' + sas_config.tmstp + '/' + encodeURIComponent(target) + '?"></scr'+'ipt>'); | |
24 | </script> | |
25 | <span>hallo</span> | |
26 | EOF | |
27 | $hs->eof; | |
28 | } | |
29 |
0 | use strict; | |
1 | ||
2 | # test for RT#94713 | |
3 | use Test::More tests => 1; | |
4 | ||
5 | my $INC = join ' ', map { "-I$_" } @INC; | |
6 | ||
7 | SKIP: { | |
8 | skip "test fails on windows", 1 if $^O eq 'MSWin32'; | |
9 | is(`MALLOC_OPTIONS=Z $^X $INC -MHTML::Strip -e 'print HTML::Strip->new->parse(q[<li>abc < 0.5 km</li><li>xyz</li>])'`, q[abc xyz]); | |
10 | } | |
11 |
0 | use Test::More tests => 6; | |
1 | ||
2 | BEGIN { use_ok 'HTML::Strip' } | |
3 | ||
4 | { | |
5 | # set_striptags( \@ARRAY ) | |
6 | my $hs = HTML::Strip->new; | |
7 | $hs->set_striptags( [ 'foo' ] ); | |
8 | ||
9 | is( $hs->parse( '<script>foo</script>bar' ), 'foo bar', 'set_striptags redefinition works' ); | |
10 | $hs->eof; | |
11 | ||
12 | is( $hs->parse( '<foo>foo</foo>bar' ), 'bar', 'set_striptags redefinition works' ); | |
13 | $hs->eof; | |
14 | } | |
15 | ||
16 | { | |
17 | # set_striptags( LIST ) | |
18 | my @striptags = qw(baz quux); | |
19 | my $hs = HTML::Strip->new; | |
20 | $hs->set_striptags( @striptags ); | |
21 | ||
22 | is( $hs->parse( '<baz>fumble</baz>bar<quux>foo</quux>' ), 'bar', 'stripping user-defined tags ok' ); | |
23 | $hs->eof; | |
24 | ||
25 | is( $hs->parse( '<baz>fumble<quux/>foo</baz>bar' ), 'bar', 'stripping user-defined tags ok' ); | |
26 | $hs->eof; | |
27 | ||
28 | is( $hs->parse( '<foo> </foo> <bar> baz </bar>' ), ' baz ', 'stripping user-defined tags ok' ); | |
29 | $hs->eof; | |
30 | } |
0 | use Test::More tests => 2; | |
1 | ||
2 | BEGIN { use_ok 'HTML::Strip' } | |
3 | ||
4 | # test for RT#19036 | |
5 | { | |
6 | my $hs = HTML::Strip->new(); | |
7 | is( $hs->parse( '<tr><td>01 May 2006</td><td>0</td><td>10</td></tr>' ), '01 May 2006 0 10', "whitespace single character bug" ); | |
8 | $hs->eof; | |
9 | } |
0 | # Before `make install' is performed this script should be runnable with | |
1 | # `make test'. After `make install' it should work as `perl test.pl' | |
2 | ||
3 | ######################### | |
4 | ||
5 | # change 'tests => 1' to 'tests => last_test_to_print'; | |
6 | ||
7 | use Test; | |
8 | BEGIN { plan tests => 17 }; | |
9 | use HTML::Strip; | |
10 | ok(1); # If we made it this far, we're ok. | |
11 | ||
12 | ######################### | |
13 | ||
14 | # Insert your test code below, the Test module is use()ed here so read | |
15 | # its man page ( perldoc Test ) for help writing this test script. | |
16 | ||
17 | my $hs = new HTML::Strip; | |
18 | ||
19 | ok( $hs->parse( 'test' ), 'test' ); | |
20 | $hs->eof; | |
21 | ||
22 | ok( $hs->parse( '<em>test</em>' ), 'test' ); | |
23 | $hs->eof; | |
24 | ||
25 | ok( $hs->parse( 'foo<br>bar' ), 'foo bar' ); | |
26 | $hs->eof; | |
27 | ||
28 | ok( $hs->parse( '<p align="center">test</p>' ), 'test' ); | |
29 | $hs->eof; | |
30 | ||
31 | ok( $hs->parse( '<p align="center>test</p>' ), '' ); | |
32 | $hs->eof; | |
33 | ||
34 | ok( $hs->parse( '<foo>bar' ), 'bar' ); | |
35 | ok( $hs->parse( '</foo>baz' ), ' baz' ); | |
36 | $hs->eof; | |
37 | ||
38 | ok( $hs->parse( '<!-- <p>foo</p> bar -->baz' ), 'baz' ); | |
39 | $hs->eof; | |
40 | ||
41 | ok( $hs->parse( '<img src="foo.gif" alt="a > b">bar' ), 'bar' ); | |
42 | $hs->eof; | |
43 | ||
44 | ok( $hs->parse( '<script>if (a<b && a>c)</script>bar' ), 'bar' ); | |
45 | $hs->eof; | |
46 | ||
47 | ok( $hs->parse( '<# just data #>bar' ), 'bar' ); | |
48 | $hs->eof; | |
49 | ||
50 | #ok( $hs->parse( '<![INCLUDE CDATA [ >>>>>>>>>>>> ]]>bar' ), 'bar' ); | |
51 | #$hs->eof; | |
52 | ||
53 | ok( $hs->parse( '<script>foo</script>bar' ), 'bar' ); | |
54 | $hs->eof; | |
55 | ||
56 | my $html_entities_p = eval 'require HTML::Entities' ? '' : 'HTML::Entities not available'; | |
57 | skip( $html_entities_p, $hs->parse( '<foo>' ), '<foo>' ); | |
58 | $hs->eof; | |
59 | skip( $html_entities_p, $hs->parse( '<foo>' ), '<foo>' ); | |
60 | $hs->eof; | |
61 | $hs->set_decode_entities(0); | |
62 | skip( $html_entities_p, $hs->parse( '<foo>' ), '<foo>' ); | |
63 | $hs->eof; | |
64 | skip( $html_entities_p, $hs->parse( '<foo>' ), '<foo>' ); | |
65 | $hs->eof; | |
66 | ||
67 | ||
68 | my $hs2 = new HTML::Strip; | |
69 | $hs2->set_striptags( [ 'foo' ] ); | |
70 | ||
71 | ok( $hs2->parse( '<script>foo</script>bar' ), 'foo bar' ); | |
72 | $hs2->eof; | |
73 | ||
74 | ok( $hs2->parse( '<foo>foo</foo>bar' ), 'bar' ); | |
75 | $hs2->eof; | |
76 | ||
77 | ok( $hs->parse( '<script>foo</script>bar' ), 'bar' ); | |
78 | $hs->eof; | |
79 | ||
80 | my @striptags = qw(baz quux); | |
81 | $hs->set_striptags( @striptags ); | |
82 | ||
83 | ok( $hs->parse( '<baz>fumble</baz>bar<quux>foo</quux>' ), 'bar' ); | |
84 | $hs->eof; | |
85 | ||
86 | ok( $hs->parse( '<baz>fumble<quux/>foo</baz>bar' ), 'bar' ); | |
87 | $hs->eof; | |
88 | ||
89 | ok( $hs->parse( '<foo> </foo> <bar> baz </bar>' ), ' baz ' ); | |
90 | $hs->eof; |