Codebase list libhtml-strip-perl / b0792ed
Imported Upstream version 1.10 gregor herrmann 9 years ago
18 changed file(s) with 561 addition(s) and 196 deletion(s). Raw diff Collapse all Expand all
00 Revision history for Perl extension HTML::Strip.
1
2 1.10 Tue Sep 30 14:34:47 UTC 2014
3 - Fix for RT#99207 (script mathematical symbol bug).
4
5 1.09 Tue Sep 30 10:39:47 UTC 2014
6 - offbyone.t disabled under Windows (RT#99219)
7
8 1.08 Fri Sep 26 15:02:37 UTC 2014
9 - system perl used in offbyone.t (RT#99151)
10
11 1.07 Tue Sep 23 14:44:08 UTC 2014
12 - fix to bug RT#19036 - tags not replaced with spaces when only a single
13 character is between the tags
14 - fix to bug RT#35345 - mathematical conparisons within <script> tags
15 misunderstood
16 (patches contributed by Adriano Ferreira)
17 - Exporter was never needed
18 - Allow other filtering operations than just decoding of HTML entities
19 - Modernised test suite
20 - Adds 'auto_reset' attribute, which allows automagic use of $hs->eof
21 - fixes quotes in html comments (RT#32355)
22 (patch contributed by Reini Urban)
23 - MSVC doesnt define strcasecmp, use stricmp instead
24 (patch contributed by Damyan Ivanov)
25 - fixes POD errors
126
227 1.06 Fri Feb 10 11:18:35 2006
328 - documented 'set_decode_entities' method
00 Changes
11 Makefile.PL
22 MANIFEST
3 META.yml
34 README
45 Strip.pm
56 Strip.xs
67 strip_html.h
78 strip_html.c
89 typemap
9 test.pl
10 t/basic.t
11 t/auto-reset.t
12 t/comment.t
13 t/edge-case.t
14 t/filter.t
15 t/mathematical-comparisons.t
16 t/offbyone.t
17 t/striptags.t
18 t/whitespace-single-char.t
0 ---
1 abstract: 'Perl extension for stripping HTML markup from text.'
2 author:
3 - 'Alex Bowley <kilinrax@cpan.org>'
4 build_requires:
5 ExtUtils::MakeMaker: 0
6 configure_requires:
7 ExtUtils::MakeMaker: 0
8 distribution_type: module
9 dynamic_config: 0
10 generated_by: 'ExtUtils::MakeMaker version 6.57_05'
11 license: unknown
12 meta-spec:
13 url: http://module-build.sourceforge.net/META-spec-v1.4.html
14 version: 1.4
15 name: HTML-Strip
16 no_index:
17 directory:
18 - t
19 - inc
20 requires: {}
21 version: 1.08
0
1 use 5.006;
02 use ExtUtils::MakeMaker;
3 my $EUMM_VERSION = eval $ExtUtils::MakeMaker::VERSION;
14
2 # See lib/ExtUtils/MakeMaker.pm for details of how to influence
3 # the contents of the Makefile that is written.
45 WriteMakefile(
5 'NAME' => 'HTML::Strip',
6 'VERSION_FROM' => 'Strip.pm', # finds $VERSION
7 'PREREQ_PM' => {}, # e.g., Module::Name => 1.1
8 ($] >= 5.005 ? ## Add these new keywords supported since 5.005
9 (ABSTRACT_FROM => 'Strip.pm', # retrieve abstract from module
10 AUTHOR => 'Alex Bowley <kilinrax@cpan.org>') : ()),
11 'LIBS' => [''], # e.g., '-lm'
12 'DEFINE' => '', # e.g., '-DHAVE_SOMETHING'
13 # Insert -I. if you add *.h files later:
14 'INC' => '', # e.g., '-I/usr/include/other'
15 # Un-comment this if you add C files to link with later:
16 'OBJECT' => '$(O_FILES)', # link all the C files too
6 'NAME' => 'HTML::Strip',
7 'VERSION_FROM' => 'Strip.pm',
8 'PREREQ_PM' => {
9 # core modules
10 'warnings' => 0,
11 'strict' => 0,
12 'Carp' => 0,
13 'Exporter' => 0,
14 'DynaLoader' => 0,
15
16 # build requires
17 'Test::More' => 0,
18 },
19 ( $] >= 5.005 ? (
20 ABSTRACT_FROM => 'Strip.pm',
21 AUTHOR => 'Alex Bowley <kilinrax@cpan.org>'
22 ) : () ),
23 ( $EUMM_VERSION >= 6.46 ? (
24 LICENSE => 'perl',
25 META_MERGE => {
26 recommended => {
27 'HTML::Entities' => 0,
28 },
29 },
30 ) : () ),
31 'LIBS' => [''], # e.g., '-lm'
32 'DEFINE' => '', # e.g., '-DHAVE_SOMETHING'
33 'INC' => '', # e.g., '-I/usr/include/other'
34 'OBJECT' => '$(O_FILES)', # link all the C files too
1735 );
00 package HTML::Strip;
1
2 require DynaLoader;
3 our @ISA = qw(DynaLoader);
4 our $VERSION = '1.10';
5 bootstrap HTML::Strip $VERSION;
16
27 use 5.006;
38 use warnings;
49 use strict;
510
6 use Carp qw( carp croak );
7
8 require Exporter;
9 require DynaLoader;
10
11 our @ISA = qw(Exporter DynaLoader);
12
13 # Items to export into callers namespace by default. Note: do not export
14 # names by default without a very good reason. Use EXPORT_OK instead.
15 # Do not simply export all your public functions/methods/constants.
16
17 # This allows declaration use HTML::Strip ':all';
18 # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
19 # will save memory.
20 our %EXPORT_TAGS = ( 'all' => [ qw(
21 ) ] );
22
23 our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
24
25 our @EXPORT = qw();
26
27 our $VERSION = '1.06';
28
29 bootstrap HTML::Strip $VERSION;
30
31 # Preloaded methods go here.
32
33 my $_html_entities_p = eval 'require HTML::Entities';
11 use Carp;
12
13 my $_html_entities_p = eval { require HTML::Entities; 1 };
3414
3515 my %defaults = (
36 striptags => [qw( title
37 style
38 script
39 applet )],
40 emit_spaces => 1,
41 decode_entities => 1,
42 );
16 striptags => [qw( title
17 style
18 script
19 applet )],
20 emit_spaces => 1,
21 decode_entities => 1,
22 filter => $_html_entities_p ? 'filter_entities' : undef,
23 auto_reset => 0,
24 debug => 0,
25 );
4326
4427 sub new {
45 my $class = shift;
46 my $obj = create();
47 bless $obj, $class;
48
49 my %args = (%defaults, @_);
50 while( my ($key, $value) = each %args ) {
51 my $method = "set_${key}";
52 if( $obj->can($method) ) {
53 $obj->$method($value);
28 my $class = shift;
29 my $obj = create();
30 bless $obj, $class;
31
32 my %args = (%defaults, @_);
33 while( my ($key, $value) = each %args ) {
34 my $method = "set_${key}";
35 if( $obj->can($method) ) {
36 $obj->$method($value);
37 } else {
38 Carp::carp "Invalid setting '$key'";
39 }
40 }
41 return $obj;
42 }
43
44 sub set_striptags {
45 my ($self, @tags) = @_;
46 if( ref($tags[0]) eq 'ARRAY' ) {
47 $self->set_striptags_ref( $tags[0] );
5448 } else {
55 carp "Invalid setting '$key'";
56 }
57 }
58 return $obj;
59 }
60
61 sub set_striptags {
62 my ($self, @tags) = @_;
63 if( ref($tags[0]) eq 'ARRAY' ) {
64 $self->set_striptags_ref( $tags[0] );
65 } else {
66 $self->set_striptags_ref( \@tags );
67 }
49 $self->set_striptags_ref( \@tags );
50 }
51 }
52
53 {
54 # an inside-out object approach
55 # for the 'filter' attribute
56 my %filter_of;
57
58 sub set_filter {
59 my ($self, $filter) = @_;
60 $filter_of{0+$self} = $filter;
61 }
62
63 sub filter {
64 my $self = shift;
65 return $filter_of{0+$self}
66 }
67
68 sub DESTROY {
69 my $self = shift;
70 delete $filter_of{0+$self};
71 }
72 }
73
74 # $decoded_string = $self->filter_entities( $string )
75 sub filter_entities {
76 my $self = shift;
77 if( $self->decode_entities ) {
78 return HTML::Entities::decode($_[0]);
79 }
80 return $_[0];
81 }
82
83 sub _do_filter {
84 my $self = shift;
85 my $filter = $self->filter;
86 # no filter: return immediately
87 return $_[0] unless defined $filter;
88
89 if ( !ref $filter ) { # method name
90 return $self->$filter( @_ );
91 } else { # code ref
92 return $filter->( @_ );
93 }
6894 }
6995
7096 sub parse {
71 my ($self, $text) = @_;
72 my $stripped = $self->strip_html( $text );
73 if( $self->decode_entities && $_html_entities_p ) {
74 $stripped = HTML::Entities::decode($stripped);
75 }
76 return $stripped;
97 my ($self, $text) = @_;
98 my $stripped = $self->strip_html( $text );
99 return $self->_do_filter( $stripped );
77100 }
78101
79102 sub eof {
80 my $self = shift;
81 $self->reset();
103 my $self = shift;
104 $self->reset();
82105 }
83106
84107 1;
135158 declaration or a comment. Within such tags, C<E<gt>> characters do not
136159 end the tag if they appear within pairs of double dashes (e.g. C<E<lt>!--
137160 E<lt>a href="old.htm"E<gt>old pageE<lt>/aE<gt> --E<gt>> would be
138 stripped completely).
161 stripped completely). Inside a comment, no parsing for quotes
162 is done as well. (That means C<E<lt>!-- comment with ' quote " --E<gt>>
163 are entirely stripped.)
139164
140165 =back
141166
153178 next call to parse to start with the remains of said tag.
154179
155180 If this is not going to be the case, be sure to call $hs->eof()
156 between calls to $hs->parse().
181 between calls to $hs->parse(). Alternatively, you may
182 set C<auto_reset> to true on the constructor or any time
183 after with C<set_auto_reset>, so that the parser will always
184 operate in one-shot basis (resetting after each parsed chunk).
157185
158186 =head2 METHODS
187
188 =over
159189
160190 =item new()
161191
200230 Takes a boolean value. If set to false, HTML::Strip will decode HTML
201231 entities. Set to true by default.
202232
233 =item filter_entities()
234
235 If HTML::Entities is available, this method behaves just
236 like invoking HTML::Entities::decode_entities, except that
237 it respects the current setting of 'decode_entities'.
238
239 =item set_filter()
240
241 Sets a filter to be applied after tags were stripped.
242 It may accept the name of a method (like 'filter_entities')
243 or a code ref. By default, its value is 'filter_entities'
244 if HTML::Entities is available or C<undef> otherwise.
245
246 =item set_auto_reset()
247
248 Takes a boolean value. If set to true, C<parse> resets after
249 each call (equivalent to calling C<eof>). Otherwise, the
250 parser remembers its state from one call to C<parse> to
251 another, until you call C<eof> explicitly. Set to false
252 by default.
253
254 =back
255
203256 =head2 LIMITATIONS
204257
205258 =over 4
226279 HTML::Strip will only attempt decoding of HTML entities if
227280 L<HTML::Entities> is installed.
228281
282 =back
283
229284 =head2 EXPORT
230285
231286 None by default.
44
55 #include "strip_html.h"
66
7 MODULE = HTML::Strip PACKAGE = HTML::Strip
7 MODULE = HTML::Strip PACKAGE = HTML::Strip
88
99 PROTOTYPES: ENABLE
1010
3838 strip_html( stripper, raw, clean );
3939 RETVAL = clean;
4040 OUTPUT:
41 RETVAL
41 RETVAL
4242 CLEANUP:
4343 Safefree( clean );
4444
8787 int n;
8888 if( (SvROK(tagref)) &&
8989 (SvTYPE(SvRV(tagref)) == SVt_PVAV) ) {
90 tags = (AV *) SvRV(tagref);
90 tags = (AV *) SvRV(tagref);
9191 } else {
9292 XSRETURN_UNDEF;
9393 }
102102 char * tag = SvPV(*av_fetch(tags, n, 0), l);
103103 add_striptag( stripper, tag );
104104 }
105
106 void
107 set_auto_reset( stripper, value )
108 Stripper * stripper
109 int value
110 CODE:
111 stripper->o_auto_reset = value;
112
113 int
114 auto_reset( stripper )
115 Stripper * stripper
116 CODE:
117 RETVAL = stripper->o_auto_reset;
118 OUTPUT:
119 RETVAL
120
121 void
122 set_debug( stripper, value )
123 Stripper * stripper
124 int value
125 CODE:
126 stripper->o_debug = value;
127
128 int
129 debug( stripper )
130 Stripper * stripper
131 CODE:
132 RETVAL = stripper->o_debug;
133 OUTPUT:
134 RETVAL
0
10 #include <stdio.h>
21 #include <ctype.h>
32 #include <string.h>
43 #include "strip_html.h"
54
6
75 void
86 strip_html( Stripper * stripper, const char * raw, char * output ) {
97 const char * p_raw = raw;
108 const char * raw_end = raw + strlen(raw);
119 char * p_output = output;
12
10
1311 while( p_raw < raw_end ) {
12 if( stripper->o_debug ) {
13 printf( "[DEBUG] char %c state %c %c %c tag:%5s, %c %c %c %c, %c %c %c %c:%c, ",
14 *p_raw,
15 (stripper->f_closing ? 'C' : ' '),
16 (stripper->f_in_tag ? 'T' : ' '),
17 (stripper->f_full_tagname ? 'F' : ' '),
18 stripper->tagname,
19 (stripper->f_just_seen_tag ? 'J' : ' '),
20 (stripper->f_outputted_space ? 'S' : ' '),
21 (stripper->f_lastchar_slash ? '/' : ' '),
22 (stripper->f_lastchar_minus ? '-' : ' '),
23 (stripper->f_in_decl ? 'D' : ' '),
24 (stripper->f_in_comment ? 'C' : ' '),
25 (stripper->f_in_striptag ? 'X' : ' '),
26 (stripper->f_in_quote ? 'Q' : ' '),
27 (stripper->quote ? stripper->quote : ' ')
28 );
29 }
1430 if( stripper->f_in_tag ) {
1531 /* inside a tag */
1632 /* check if we know either the tagname, or that we're in a declaration */
2238 /* then check if the first character is a '/', in which case, this is a closing tag */
2339 else if( stripper->p_tagname == stripper->tagname && *p_raw == '/' ) {
2440 stripper->f_closing = 1;
25 } else {
41 }
42 /* if the first character wasn't a '/', and we're in a stripped block,
43 * assume this is a mathematical operator and reset */
44 else if( !stripper->f_closing && stripper->f_in_striptag && stripper->p_tagname == stripper->tagname && *p_raw != '/' ) {
45 stripper->f_in_tag = 0;
46 stripper->f_closing = 0;
47 /* we only care about closing tags within a stripped tags block (e.g. scripts) */
48 } else if( !stripper->f_in_striptag || stripper->f_closing ) {
2649 /* if we don't have the full tag name yet, add current character unless it's whitespace, a '/', or a '>';
2750 otherwise null pad the string and set the full tagname flag, and check the tagname against stripped ones.
2851 also sanity check we haven't reached the array bounds, and truncate the tagname here if we have */
4063 /* if we're outside a stripped tag block, check tagname against stripped tag list */
4164 } else if( !stripper->f_in_striptag && !stripper->f_closing ) {
4265 int i;
43 for( i = 0; i <= stripper->numstriptags; i++ ) {
66 for( i = 0; i < stripper->numstriptags; i++ ) {
4467 if( strcasecmp( stripper->tagname, stripper->o_striptags[i] ) == 0 ) {
4568 stripper->f_in_striptag = 1;
4669 strcpy( stripper->striptag, stripper->tagname );
6083 }
6184 } else {
6285 /* not in a quote */
63 /* check for quote characters */
64 if( *p_raw == '\'' || *p_raw == '\"' ) {
86 /* check for quote characters, but not in a comment */
87 if( !stripper->f_in_comment &&
88 ( *p_raw == '\'' || *p_raw == '\"' ) ) {
6589 stripper->f_in_quote = 1;
6690 stripper->quote = *p_raw;
6791 /* reset lastchar_* flags in case we have something perverse like '-"' or '/"' */
111135 /* output a space in place of tags we have previously parsed,
112136 and set a flag so we only do this once for every group of tags.
113137 done here to prevent unnecessary trailing spaces */
114 if( isspace(*p_raw) ) {
138 if( !isspace(*p_raw) &&
115139 /* don't output a space if this character is one anyway */
140 !stripper->f_outputted_space &&
141 stripper->f_just_seen_tag ) {
142 if( stripper->o_debug ) {
143 printf("SPACE ");
144 }
145 *p_output++ = ' ';
116146 stripper->f_outputted_space = 1;
117 } else {
118 if( !stripper->f_outputted_space &&
119 stripper->f_just_seen_tag ) {
120 *p_output++ = ' ';
121 stripper->f_outputted_space = 1;
122 } else {
123 /* this character must not be a space */
124 stripper->f_outputted_space = 0;
125 }
126147 }
148 }
149 if( stripper->o_debug ) {
150 printf("CHAR %c", *p_raw);
127151 }
128152 *p_output++ = *p_raw;
129153 /* reset 'just seen tag' flag */
130154 stripper->f_just_seen_tag = 0;
155 /* reset 'outputted space' flag if character is not one */
156 if (!isspace(*p_raw)) {
157 stripper->f_outputted_space = 0;
158 } else {
159 stripper->f_outputted_space = 1;
160 }
131161 }
132162 }
133163 } /* in tag check */
134164 p_raw++;
165 if( stripper->o_debug ) {
166 printf("\n");
167 }
135168 } /* while loop */
136169
137170 *p_output = 0;
171
172 if (stripper->o_auto_reset) {
173 reset( stripper );
174 }
138175 }
139176
140177 void
146183 /* hack to stop a space being output on strings starting with a tag */
147184 stripper->f_outputted_space = 1;
148185 stripper->f_just_seen_tag = 0;
149
186
150187 stripper->f_in_quote = 0;
151188
152189 stripper->f_in_decl = 0;
153190 stripper->f_in_comment = 0;
154191 stripper->f_lastchar_minus = 0;
155
192
156193 stripper->f_in_striptag = 0;
194
157195 }
158196
159197 void
171209 }
172210 }
173211
212 #ifdef _MSC_VER
213 #define strcasecmp(a,b) stricmp(a,b)
214 #endif
174215
175216 void
176217 check_end( Stripper * stripper, char end ) {
179220 stripper->f_lastchar_slash = 1;
180221 } else {
181222 /* if the current character is a '>', then the tag has ended */
182 if( end == '>' ) {
223 /* slight hack to deal with mathematical characters in script tags:
224 * if we're in a stripped block, and this is a closing tag, spaces
225 * will also end the tag, since we only want it for comparison with
226 * the opening one */
227 if( (end == '>') ||
228 (stripper->f_in_striptag && stripper->f_closing && isspace(end)) ) {
183229 stripper->f_in_quote = 0;
184230 stripper->f_in_comment = 0;
185231 stripper->f_in_decl = 0;
186232 stripper->f_in_tag = 0;
233 stripper->f_closing = 0;
187234 /* Do not start a stripped tag block if the tag is a closed one, e.g. '<script src="foo" />' */
188235 if( stripper->f_lastchar_slash &&
189236 (strcasecmp( stripper->striptag, stripper->tagname ) == 0) ) {
2626 int numstriptags;
2727 int o_emit_spaces;
2828 int o_decode_entities;
29
30 int o_auto_reset;
31
32 int o_debug;
2933 } Stripper;
3034
3135 void strip_html( Stripper * stripper, const char * raw, char * clean );
0
1 use Test::More tests => 5;
2
3 BEGIN { use_ok 'HTML::Strip' }
4
5 {
6 my $hs = HTML::Strip->new; # auto_reset off by default
7 my $o = $hs->parse( "<html>\nTitle\n<script>a+b\n" );
8 is( $o, "\nTitle\n" );
9 my $o2 = $hs->parse( "c+d\n</script>\nEnd\n</html>" );
10 is( $o2, "\nEnd\n" );
11 }
12
13 {
14 my $hs = HTML::Strip->new( auto_reset => 1 ); # auto_reset on
15 my $o = $hs->parse( "<html>\nTitle\n<script>a+b\n" );
16 is( $o, "\nTitle\n" );
17 my $o2 = $hs->parse( "c+d\n</script>\nEnd\n</html>" );
18 is( $o2, "c+d\n\nEnd\n" );
19 }
0 use Test::More tests => 19;
1
2 BEGIN { use_ok 'HTML::Strip' }
3
4 {
5 my $hs = new HTML::Strip;
6
7 is( $hs->parse( 'test' ), 'test', 'works with plain text' );
8 $hs->eof;
9
10 is( $hs->parse( '<em>test</em>' ), 'test', 'works with <em>|</em> tags' );
11 $hs->eof;
12
13 is( $hs->parse( 'foo<br>bar' ), 'foo bar', 'works with <br> tag' );
14 $hs->eof;
15
16 is( $hs->parse( '<p align="center">test</p>' ), 'test', 'works with tags with attributes' );
17 $hs->eof;
18
19 is( $hs->parse( '<p align="center>test</p>' ), '', '"works" with non-terminated quotes' );
20 $hs->eof;
21
22 is( $hs->parse( '<foo>bar' ), 'bar', 'strips <foo> tags' );
23 is( $hs->parse( '</foo>baz' ), ' baz', 'strips </foo> tags' );
24 $hs->eof;
25
26 is( $hs->parse( '<!-- <p>foo</p> bar -->baz' ), 'baz', 'strip comments' );
27 $hs->eof;
28
29 is( $hs->parse( '<img src="foo.gif" alt="a > b">bar' ), 'bar', 'works with quote attributes which contain >' );
30 $hs->eof;
31
32 is( $hs->parse( '<script> if (a>b && a<c) { ... } </script>bar' ), 'bar', '<script> tag and content are stripped' );
33 $hs->eof;
34
35 is( $hs->parse( '<# just data #>bar' ), 'bar', 'weird tags get stripped' );
36 $hs->eof;
37
38 TODO: {
39 local $TODO = "fix CDATA handling";
40 is( $hs->parse( '<![INCLUDE CDATA [ >>>>>>>>>>>> ]]>bar' ), 'bar', 'character data gets stripped' );
41 $hs->eof;
42 }
43
44 is( $hs->parse( '<script>foo</script>bar' ), 'bar', '<script> nodes are stripped' );
45 $hs->eof;
46
47 my $has_html_entities = eval { require HTML::Entities; 1 };
48 SKIP: {
49 skip 'HTML::Entities not available', 2 unless $has_html_entities;
50
51 is( $hs->parse( '&#060;foo&#062;' ), '<foo>', 'numeric HTML entities are decoded' );
52 $hs->eof;
53 is( $hs->parse( '&lt;foo&gt;' ), '<foo>', 'HTML entities are decoded' );
54 $hs->eof;
55 }
56
57 $hs->set_decode_entities(0);
58 is( $hs->parse( '&#060;foo&#062;' ), '&#060;foo&#062;', 'entities decoding off works' );
59 $hs->eof;
60
61 is( $hs->parse( '&lt;foo&gt;' ), '&lt;foo&gt;', 'entities decoding off works' );
62 $hs->eof;
63
64 is( $hs->parse( '<script>foo</script>bar' ), 'bar', '"script" is a default strip_tag' );
65 $hs->eof;
66 }
0
1 # http://rt.cpan.org/Public/Bug/Display.html?id=32355
2
3 use Test::More tests => 7;
4
5 BEGIN { use_ok 'HTML::Strip' }
6
7 # stripping declarations
8 {
9 my $hs = HTML::Strip->new();
10 is( $hs->parse( q{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"><html>Text</html>} ),
11 "Text", 'decls are stripped' );
12 $hs->eof;
13 }
14
15 # stripping comments
16 {
17 my $hs = HTML::Strip->new();
18 is( $hs->parse( q{<html><!-- a comment to be stripped -->Hello World!</html>} ),
19 "Hello World!", "comments are stripped" );
20 $hs->eof;
21
22 is( $hs->parse( q{<html><!-- comment with a ' apos -->Hello World!</html>} ),
23 "Hello World!", q{comments may contain '} );
24 $hs->eof;
25
26 is( $hs->parse( q{<html><!-- comment with a " quote -->Hello World!</html>} ),
27 "Hello World!", q{comments may contain "} );
28 $hs->eof;
29
30 is( $hs->parse( q{<html><!-- comment -- "quote" >Hello World!</html>} ),
31 "Hello World!", "weird decls are stripped" );
32 $hs->eof;
33
34 is( $hs->parse( "a<>b" ),
35 "a b", 'edge case with <> ok' );
36
37 }
0 use Test::More;
1
2 BEGIN { use_ok 'HTML::Strip' }
3
4 # test for RT#21008
5
6 # stripping comments
7 {
8 my $hs = HTML::Strip->new();
9 is( $hs->parse( "a<>b" ), "a b", 'edge case with <> ok' );
10 $hs->eof;
11 is( $hs->parse( "a<>b c<>d" ), "a b c d", 'edge case with <>s ok' );
12 $hs->eof;
13 is( $hs->parse( "From: <>\n\na. Title: some text\n\nb. etc\n" ), "From: \n\na. Title: some text\n\nb. etc\n", 'test case' );
14 is( $hs->parse( "From: <>\n\na. Title: some text\n\nb. etc\n" ), "From: \n\na. Title: some text\n\nb. etc\n", 'test case' );
15 $hs->eof;
16 is( $hs->parse( q{this is an "example" with 'quoted' parts that should not be stripped} ), q{this is an "example" with 'quoted' parts that should not be stripped} );
17 }
18
19 done_testing;
0 use Test::More tests => 3;
1
2 BEGIN { use_ok 'HTML::Strip' }
3
4 {
5 my $hs = HTML::Strip->new( filter => undef );
6 ok( $hs->parse( '<html>&nbsp;</html>' ), '&nbsp;' );
7 $hs->eof;
8
9 }
10
11 {
12 my $filter = sub { my $s = shift; $s =~ s/\s/ /g;; $s };
13 my $hs = HTML::Strip->new( filter => $filter );
14 ok( $hs->parse( "<html>title\ntext\ntext</html>" ), 'title text text' );
15 $hs->eof;
16
17 }
0 use Test::More tests => 3;
1
2 BEGIN { use_ok 'HTML::Strip' }
3
4 # test for RT#35345
5 {
6 my $hs = HTML::Strip->new();
7 is( $hs->parse( <<EOF ), "\nHello\n", "mathematical comparisons in strip tags bug RT#35345" );
8 <script>
9 function shovelerMain (detectBuyBox) {
10 for (var i = 0; i < Shoveler.Instances.length; i++) {
11 ...
12 </script>
13 <h1>Hello</h1>
14 EOF
15 $hs->eof;
16 }
17
18 # test for RT#99207
19 {
20 my $hs = HTML::Strip->new();
21 is( $hs->parse( <<EOF ), "\nhallo\n", "mathematical comparisons in strip tags bug RT#99207" );
22 <script type="text/javascript">
23 document.write('<scr'+'ipt src="//www3.smartadserver.com/call/pubj/' + sas_config.pageid + '/' + formatid + '/' + sas_config.master + '/' + sas_config.tmstp + '/' + encodeURIComponent(target) + '?"></scr'+'ipt>');
24 </script>
25 <span>hallo</span>
26 EOF
27 $hs->eof;
28 }
29
0 use strict;
1
2 # test for RT#94713
3 use Test::More tests => 1;
4
5 my $INC = join ' ', map { "-I$_" } @INC;
6
7 SKIP: {
8 skip "test fails on windows", 1 if $^O eq 'MSWin32';
9 is(`MALLOC_OPTIONS=Z $^X $INC -MHTML::Strip -e 'print HTML::Strip->new->parse(q[<li>abc < 0.5 km</li><li>xyz</li>])'`, q[abc xyz]);
10 }
11
0 use Test::More tests => 6;
1
2 BEGIN { use_ok 'HTML::Strip' }
3
4 {
5 # set_striptags( \@ARRAY )
6 my $hs = HTML::Strip->new;
7 $hs->set_striptags( [ 'foo' ] );
8
9 is( $hs->parse( '<script>foo</script>bar' ), 'foo bar', 'set_striptags redefinition works' );
10 $hs->eof;
11
12 is( $hs->parse( '<foo>foo</foo>bar' ), 'bar', 'set_striptags redefinition works' );
13 $hs->eof;
14 }
15
16 {
17 # set_striptags( LIST )
18 my @striptags = qw(baz quux);
19 my $hs = HTML::Strip->new;
20 $hs->set_striptags( @striptags );
21
22 is( $hs->parse( '<baz>fumble</baz>bar<quux>foo</quux>' ), 'bar', 'stripping user-defined tags ok' );
23 $hs->eof;
24
25 is( $hs->parse( '<baz>fumble<quux/>foo</baz>bar' ), 'bar', 'stripping user-defined tags ok' );
26 $hs->eof;
27
28 is( $hs->parse( '<foo> </foo> <bar> baz </bar>' ), ' baz ', 'stripping user-defined tags ok' );
29 $hs->eof;
30 }
0 use Test::More tests => 2;
1
2 BEGIN { use_ok 'HTML::Strip' }
3
4 # test for RT#19036
5 {
6 my $hs = HTML::Strip->new();
7 is( $hs->parse( '<tr><td>01 May 2006</td><td>0</td><td>10</td></tr>' ), '01 May 2006 0 10', "whitespace single character bug" );
8 $hs->eof;
9 }
+0
-91
test.pl less more
0 # Before `make install' is performed this script should be runnable with
1 # `make test'. After `make install' it should work as `perl test.pl'
2
3 #########################
4
5 # change 'tests => 1' to 'tests => last_test_to_print';
6
7 use Test;
8 BEGIN { plan tests => 17 };
9 use HTML::Strip;
10 ok(1); # If we made it this far, we're ok.
11
12 #########################
13
14 # Insert your test code below, the Test module is use()ed here so read
15 # its man page ( perldoc Test ) for help writing this test script.
16
17 my $hs = new HTML::Strip;
18
19 ok( $hs->parse( 'test' ), 'test' );
20 $hs->eof;
21
22 ok( $hs->parse( '<em>test</em>' ), 'test' );
23 $hs->eof;
24
25 ok( $hs->parse( 'foo<br>bar' ), 'foo bar' );
26 $hs->eof;
27
28 ok( $hs->parse( '<p align="center">test</p>' ), 'test' );
29 $hs->eof;
30
31 ok( $hs->parse( '<p align="center>test</p>' ), '' );
32 $hs->eof;
33
34 ok( $hs->parse( '<foo>bar' ), 'bar' );
35 ok( $hs->parse( '</foo>baz' ), ' baz' );
36 $hs->eof;
37
38 ok( $hs->parse( '<!-- <p>foo</p> bar -->baz' ), 'baz' );
39 $hs->eof;
40
41 ok( $hs->parse( '<img src="foo.gif" alt="a > b">bar' ), 'bar' );
42 $hs->eof;
43
44 ok( $hs->parse( '<script>if (a<b && a>c)</script>bar' ), 'bar' );
45 $hs->eof;
46
47 ok( $hs->parse( '<# just data #>bar' ), 'bar' );
48 $hs->eof;
49
50 #ok( $hs->parse( '<![INCLUDE CDATA [ >>>>>>>>>>>> ]]>bar' ), 'bar' );
51 #$hs->eof;
52
53 ok( $hs->parse( '<script>foo</script>bar' ), 'bar' );
54 $hs->eof;
55
56 my $html_entities_p = eval 'require HTML::Entities' ? '' : 'HTML::Entities not available';
57 skip( $html_entities_p, $hs->parse( '&#060;foo&#062;' ), '<foo>' );
58 $hs->eof;
59 skip( $html_entities_p, $hs->parse( '&lt;foo&gt;' ), '<foo>' );
60 $hs->eof;
61 $hs->set_decode_entities(0);
62 skip( $html_entities_p, $hs->parse( '&#060;foo&#062;' ), '&#060;foo&#062;' );
63 $hs->eof;
64 skip( $html_entities_p, $hs->parse( '&lt;foo&gt;' ), '&lt;foo&gt;' );
65 $hs->eof;
66
67
68 my $hs2 = new HTML::Strip;
69 $hs2->set_striptags( [ 'foo' ] );
70
71 ok( $hs2->parse( '<script>foo</script>bar' ), 'foo bar' );
72 $hs2->eof;
73
74 ok( $hs2->parse( '<foo>foo</foo>bar' ), 'bar' );
75 $hs2->eof;
76
77 ok( $hs->parse( '<script>foo</script>bar' ), 'bar' );
78 $hs->eof;
79
80 my @striptags = qw(baz quux);
81 $hs->set_striptags( @striptags );
82
83 ok( $hs->parse( '<baz>fumble</baz>bar<quux>foo</quux>' ), 'bar' );
84 $hs->eof;
85
86 ok( $hs->parse( '<baz>fumble<quux/>foo</baz>bar' ), 'bar' );
87 $hs->eof;
88
89 ok( $hs->parse( '<foo> </foo> <bar> baz </bar>' ), ' baz ' );
90 $hs->eof;