Commit b0792edf603a8b4b595162146bc540f9953bb9ea - libhtml-strip-perl

+25

-0

Changes less more

0	0	Revision history for Perl extension HTML::Strip.
	1
	2	1.10 Tue Sep 30 14:34:47 UTC 2014
	3	- Fix for RT#99207 (script mathematical symbol bug).
	4
	5	1.09 Tue Sep 30 10:39:47 UTC 2014
	6	- offbyone.t disabled under Windows (RT#99219)
	7
	8	1.08 Fri Sep 26 15:02:37 UTC 2014
	9	- system perl used in offbyone.t (RT#99151)
	10
	11	1.07 Tue Sep 23 14:44:08 UTC 2014
	12	- fix to bug RT#19036 - tags not replaced with spaces when only a single
	13	character is between the tags
	14	- fix to bug RT#35345 - mathematical conparisons within <script> tags
	15	misunderstood
	16	(patches contributed by Adriano Ferreira)
	17	- Exporter was never needed
	18	- Allow other filtering operations than just decoding of HTML entities
	19	- Modernised test suite
	20	- Adds 'auto_reset' attribute, which allows automagic use of $hs->eof
	21	- fixes quotes in html comments (RT#32355)
	22	(patch contributed by Reini Urban)
	23	- MSVC doesnt define strcasecmp, use stricmp instead
	24	(patch contributed by Damyan Ivanov)
	25	- fixes POD errors
1	26
2	27	1.06 Fri Feb 10 11:18:35 2006
3	28	- documented 'set_decode_entities' method

+10

-1

MANIFEST less more

0	0	Changes
1	1	Makefile.PL
2	2	MANIFEST
	3	META.yml
3	4	README
4	5	Strip.pm
5	6	Strip.xs
6	7	strip_html.h
7	8	strip_html.c
8	9	typemap
9		test.pl
	10	t/basic.t
	11	t/auto-reset.t
	12	t/comment.t
	13	t/edge-case.t
	14	t/filter.t
	15	t/mathematical-comparisons.t
	16	t/offbyone.t
	17	t/striptags.t
	18	t/whitespace-single-char.t

+22

-0

META.yml less more

	0	---
	1	abstract: 'Perl extension for stripping HTML markup from text.'
	2	author:
	3	- 'Alex Bowley <kilinrax@cpan.org>'
	4	build_requires:
	5	ExtUtils::MakeMaker: 0
	6	configure_requires:
	7	ExtUtils::MakeMaker: 0
	8	distribution_type: module
	9	dynamic_config: 0
	10	generated_by: 'ExtUtils::MakeMaker version 6.57_05'
	11	license: unknown
	12	meta-spec:
	13	url: http://module-build.sourceforge.net/META-spec-v1.4.html
	14	version: 1.4
	15	name: HTML-Strip
	16	no_index:
	17	directory:
	18	- t
	19	- inc
	20	requires: {}
	21	version: 1.08

+32

-14

Makefile.PL less more

	0
	1	use 5.006;
0	2	use ExtUtils::MakeMaker;
	3	my $EUMM_VERSION = eval $ExtUtils::MakeMaker::VERSION;
1	4
2		# See lib/ExtUtils/MakeMaker.pm for details of how to influence
3		# the contents of the Makefile that is written.
4	5	WriteMakefile(
5		'NAME' => 'HTML::Strip',
6		'VERSION_FROM' => 'Strip.pm', # finds $VERSION
7		'PREREQ_PM' => {}, # e.g., Module::Name => 1.1
8		($] >= 5.005 ? ## Add these new keywords supported since 5.005
9		(ABSTRACT_FROM => 'Strip.pm', # retrieve abstract from module
10		AUTHOR => 'Alex Bowley <kilinrax@cpan.org>') : ()),
11		'LIBS' => [''], # e.g., '-lm'
12		'DEFINE' => '', # e.g., '-DHAVE_SOMETHING'
13		# Insert -I. if you add *.h files later:
14		'INC' => '', # e.g., '-I/usr/include/other'
15		# Un-comment this if you add C files to link with later:
16		'OBJECT' => '$(O_FILES)', # link all the C files too
	6	'NAME' => 'HTML::Strip',
	7	'VERSION_FROM' => 'Strip.pm',
	8	'PREREQ_PM' => {
	9	# core modules
	10	'warnings' => 0,
	11	'strict' => 0,
	12	'Carp' => 0,
	13	'Exporter' => 0,
	14	'DynaLoader' => 0,
	15
	16	# build requires
	17	'Test::More' => 0,
	18	},
	19	( $] >= 5.005 ? (
	20	ABSTRACT_FROM => 'Strip.pm',
	21	AUTHOR => 'Alex Bowley <kilinrax@cpan.org>'
	22	) : () ),
	23	( $EUMM_VERSION >= 6.46 ? (
	24	LICENSE => 'perl',
	25	META_MERGE => {
	26	recommended => {
	27	'HTML::Entities' => 0,
	28	},
	29	},
	30	) : () ),
	31	'LIBS' => [''], # e.g., '-lm'
	32	'DEFINE' => '', # e.g., '-DHAVE_SOMETHING'
	33	'INC' => '', # e.g., '-I/usr/include/other'
	34	'OBJECT' => '$(O_FILES)', # link all the C files too
17	35	);

+122

-67

Strip.pm less more

0	0	package HTML::Strip;
	1
	2	require DynaLoader;
	3	our @ISA = qw(DynaLoader);
	4	our $VERSION = '1.10';
	5	bootstrap HTML::Strip $VERSION;
1	6
2	7	use 5.006;
3	8	use warnings;
4	9	use strict;
5	10
6		use Carp qw( carp croak );
7
8		require Exporter;
9		require DynaLoader;
10
11		our @ISA = qw(Exporter DynaLoader);
12
13		# Items to export into callers namespace by default. Note: do not export
14		# names by default without a very good reason. Use EXPORT_OK instead.
15		# Do not simply export all your public functions/methods/constants.
16
17		# This allows declaration use HTML::Strip ':all';
18		# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
19		# will save memory.
20		our %EXPORT_TAGS = ( 'all' => [ qw(
21		) ] );
22
23		our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
24
25		our @EXPORT = qw();
26
27		our $VERSION = '1.06';
28
29		bootstrap HTML::Strip $VERSION;
30
31		# Preloaded methods go here.
32
33		my $_html_entities_p = eval 'require HTML::Entities';
	11	use Carp;
	12
	13	my $_html_entities_p = eval { require HTML::Entities; 1 };
34	14
35	15	my %defaults = (
36		striptags => [qw( title
37		style
38		script
39		applet )],
40		emit_spaces => 1,
41		decode_entities => 1,
42		);
	16	striptags => [qw( title
	17	style
	18	script
	19	applet )],
	20	emit_spaces => 1,
	21	decode_entities => 1,
	22	filter => $_html_entities_p ? 'filter_entities' : undef,
	23	auto_reset => 0,
	24	debug => 0,
	25	);
43	26
44	27	sub new {
45		my $class = shift;
46		my $obj = create();
47		bless $obj, $class;
48
49		my %args = (%defaults, @_);
50		while( my ($key, $value) = each %args ) {
51		my $method = "set_${key}";
52		if( $obj->can($method) ) {
53		$obj->$method($value);
	28	my $class = shift;
	29	my $obj = create();
	30	bless $obj, $class;
	31
	32	my %args = (%defaults, @_);
	33	while( my ($key, $value) = each %args ) {
	34	my $method = "set_${key}";
	35	if( $obj->can($method) ) {
	36	$obj->$method($value);
	37	} else {
	38	Carp::carp "Invalid setting '$key'";
	39	}
	40	}
	41	return $obj;
	42	}
	43
	44	sub set_striptags {
	45	my ($self, @tags) = @_;
	46	if( ref($tags[0]) eq 'ARRAY' ) {
	47	$self->set_striptags_ref( $tags[0] );
54	48	} else {
55		carp "Invalid setting '$key'";
56		}
57		}
58		return $obj;
59		}
60
61		sub set_striptags {
62		my ($self, @tags) = @_;
63		if( ref($tags[0]) eq 'ARRAY' ) {
64		$self->set_striptags_ref( $tags[0] );
65		} else {
66		$self->set_striptags_ref( \@tags );
67		}
	49	$self->set_striptags_ref( \@tags );
	50	}
	51	}
	52
	53	{
	54	# an inside-out object approach
	55	# for the 'filter' attribute
	56	my %filter_of;
	57
	58	sub set_filter {
	59	my ($self, $filter) = @_;
	60	$filter_of{0+$self} = $filter;
	61	}
	62
	63	sub filter {
	64	my $self = shift;
	65	return $filter_of{0+$self}
	66	}
	67
	68	sub DESTROY {
	69	my $self = shift;
	70	delete $filter_of{0+$self};
	71	}
	72	}
	73
	74	# $decoded_string = $self->filter_entities( $string )
	75	sub filter_entities {
	76	my $self = shift;
	77	if( $self->decode_entities ) {
	78	return HTML::Entities::decode($_[0]);
	79	}
	80	return $_[0];
	81	}
	82
	83	sub _do_filter {
	84	my $self = shift;
	85	my $filter = $self->filter;
	86	# no filter: return immediately
	87	return $_[0] unless defined $filter;
	88
	89	if ( !ref $filter ) { # method name
	90	return $self->$filter( @_ );
	91	} else { # code ref
	92	return $filter->( @_ );
	93	}
68	94	}
69	95
70	96	sub parse {
71		my ($self, $text) = @_;
72		my $stripped = $self->strip_html( $text );
73		if( $self->decode_entities && $_html_entities_p ) {
74		$stripped = HTML::Entities::decode($stripped);
75		}
76		return $stripped;
	97	my ($self, $text) = @_;
	98	my $stripped = $self->strip_html( $text );
	99	return $self->_do_filter( $stripped );
77	100	}
78	101
79	102	sub eof {
80		my $self = shift;
81		$self->reset();
	103	my $self = shift;
	104	$self->reset();
82	105	}
83	106
84	107	1;

135	158	declaration or a comment. Within such tags, C<E<gt>> characters do not
136	159	end the tag if they appear within pairs of double dashes (e.g. C<E<lt>!--
137	160	E<lt>a href="old.htm"E<gt>old pageE<lt>/aE<gt> --E<gt>> would be
138		stripped completely).
	161	stripped completely). Inside a comment, no parsing for quotes
	162	is done as well. (That means C<E<lt>!-- comment with ' quote " --E<gt>>
	163	are entirely stripped.)
139	164
140	165	=back
141	166

153	178	next call to parse to start with the remains of said tag.
154	179
155	180	If this is not going to be the case, be sure to call $hs->eof()
156		between calls to $hs->parse().
	181	between calls to $hs->parse(). Alternatively, you may
	182	set C<auto_reset> to true on the constructor or any time
	183	after with C<set_auto_reset>, so that the parser will always
	184	operate in one-shot basis (resetting after each parsed chunk).
157	185
158	186	=head2 METHODS
	187
	188	=over
159	189
160	190	=item new()
161	191

200	230	Takes a boolean value. If set to false, HTML::Strip will decode HTML
201	231	entities. Set to true by default.
202	232
	233	=item filter_entities()
	234
	235	If HTML::Entities is available, this method behaves just
	236	like invoking HTML::Entities::decode_entities, except that
	237	it respects the current setting of 'decode_entities'.
	238
	239	=item set_filter()
	240
	241	Sets a filter to be applied after tags were stripped.
	242	It may accept the name of a method (like 'filter_entities')
	243	or a code ref. By default, its value is 'filter_entities'
	244	if HTML::Entities is available or C<undef> otherwise.
	245
	246	=item set_auto_reset()
	247
	248	Takes a boolean value. If set to true, C<parse> resets after
	249	each call (equivalent to calling C<eof>). Otherwise, the
	250	parser remembers its state from one call to C<parse> to
	251	another, until you call C<eof> explicitly. Set to false
	252	by default.
	253
	254	=back
	255
203	256	=head2 LIMITATIONS
204	257
205	258	=over 4

226	279	HTML::Strip will only attempt decoding of HTML entities if
227	280	L<HTML::Entities> is installed.
228	281
	282	=back
	283
229	284	=head2 EXPORT
230	285
231	286	None by default.

+33

-3

Strip.xs less more

4	4
5	5	#include "strip_html.h"
6	6
7		MODULE = HTML::Strip PACKAGE = HTML::Strip
	7	MODULE = HTML::Strip PACKAGE = HTML::Strip
8	8
9	9	PROTOTYPES: ENABLE
10	10

38	38	strip_html( stripper, raw, clean );
39	39	RETVAL = clean;
40	40	OUTPUT:
41		RETVAL
	41	RETVAL
42	42	CLEANUP:
43	43	Safefree( clean );
44	44

87	87	int n;
88	88	if( (SvROK(tagref)) &&
89	89	(SvTYPE(SvRV(tagref)) == SVt_PVAV) ) {
90		tags = (AV *) SvRV(tagref);
	90	tags = (AV *) SvRV(tagref);
91	91	} else {
92	92	XSRETURN_UNDEF;
93	93	}

102	102	char * tag = SvPV(*av_fetch(tags, n, 0), l);
103	103	add_striptag( stripper, tag );
104	104	}
	105
	106	void
	107	set_auto_reset( stripper, value )
	108	Stripper * stripper
	109	int value
	110	CODE:
	111	stripper->o_auto_reset = value;
	112
	113	int
	114	auto_reset( stripper )
	115	Stripper * stripper
	116	CODE:
	117	RETVAL = stripper->o_auto_reset;
	118	OUTPUT:
	119	RETVAL
	120
	121	void
	122	set_debug( stripper, value )
	123	Stripper * stripper
	124	int value
	125	CODE:
	126	stripper->o_debug = value;
	127
	128	int
	129	debug( stripper )
	130	Stripper * stripper
	131	CODE:
	132	RETVAL = stripper->o_debug;
	133	OUTPUT:
	134	RETVAL

+67

-20

strip_html.c less more

0
1	0	#include <stdio.h>
2	1	#include <ctype.h>
3	2	#include <string.h>
4	3	#include "strip_html.h"
5	4
6
7	5	void
8	6	strip_html( Stripper * stripper, const char * raw, char * output ) {
9	7	const char * p_raw = raw;
10	8	const char * raw_end = raw + strlen(raw);
11	9	char * p_output = output;
12
	10
13	11	while( p_raw < raw_end ) {
	12	if( stripper->o_debug ) {
	13	printf( "[DEBUG] char %c state %c %c %c tag:%5s, %c %c %c %c, %c %c %c %c:%c, ",
	14	*p_raw,
	15	(stripper->f_closing ? 'C' : ' '),
	16	(stripper->f_in_tag ? 'T' : ' '),
	17	(stripper->f_full_tagname ? 'F' : ' '),
	18	stripper->tagname,
	19	(stripper->f_just_seen_tag ? 'J' : ' '),
	20	(stripper->f_outputted_space ? 'S' : ' '),
	21	(stripper->f_lastchar_slash ? '/' : ' '),
	22	(stripper->f_lastchar_minus ? '-' : ' '),
	23	(stripper->f_in_decl ? 'D' : ' '),
	24	(stripper->f_in_comment ? 'C' : ' '),
	25	(stripper->f_in_striptag ? 'X' : ' '),
	26	(stripper->f_in_quote ? 'Q' : ' '),
	27	(stripper->quote ? stripper->quote : ' ')
	28	);
	29	}
14	30	if( stripper->f_in_tag ) {
15	31	/* inside a tag */
16	32	/* check if we know either the tagname, or that we're in a declaration */

22	38	/* then check if the first character is a '/', in which case, this is a closing tag */
23	39	else if( stripper->p_tagname == stripper->tagname && *p_raw == '/' ) {
24	40	stripper->f_closing = 1;
25		} else {
	41	}
	42	/* if the first character wasn't a '/', and we're in a stripped block,
	43	* assume this is a mathematical operator and reset */
	44	else if( !stripper->f_closing && stripper->f_in_striptag && stripper->p_tagname == stripper->tagname && *p_raw != '/' ) {
	45	stripper->f_in_tag = 0;
	46	stripper->f_closing = 0;
	47	/* we only care about closing tags within a stripped tags block (e.g. scripts) */
	48	} else if( !stripper->f_in_striptag \|\| stripper->f_closing ) {
26	49	/* if we don't have the full tag name yet, add current character unless it's whitespace, a '/', or a '>';
27	50	otherwise null pad the string and set the full tagname flag, and check the tagname against stripped ones.
28	51	also sanity check we haven't reached the array bounds, and truncate the tagname here if we have */

40	63	/* if we're outside a stripped tag block, check tagname against stripped tag list */
41	64	} else if( !stripper->f_in_striptag && !stripper->f_closing ) {
42	65	int i;
43		for( i = 0; i <= stripper->numstriptags; i++ ) {
	66	for( i = 0; i < stripper->numstriptags; i++ ) {
44	67	if( strcasecmp( stripper->tagname, stripper->o_striptags[i] ) == 0 ) {
45	68	stripper->f_in_striptag = 1;
46	69	strcpy( stripper->striptag, stripper->tagname );

60	83	}
61	84	} else {
62	85	/* not in a quote */
63		/* check for quote characters */
64		if( p_raw == '\'' \|\| p_raw == '\"' ) {
	86	/* check for quote characters, but not in a comment */
	87	if( !stripper->f_in_comment &&
	88	( p_raw == '\'' \|\| p_raw == '\"' ) ) {
65	89	stripper->f_in_quote = 1;
66	90	stripper->quote = *p_raw;
67	91	/* reset lastchar_* flags in case we have something perverse like '-"' or '/"' */

111	135	/* output a space in place of tags we have previously parsed,
112	136	and set a flag so we only do this once for every group of tags.
113	137	done here to prevent unnecessary trailing spaces */
114		if( isspace(*p_raw) ) {
	138	if( !isspace(*p_raw) &&
115	139	/* don't output a space if this character is one anyway */
	140	!stripper->f_outputted_space &&
	141	stripper->f_just_seen_tag ) {
	142	if( stripper->o_debug ) {
	143	printf("SPACE ");
	144	}
	145	*p_output++ = ' ';
116	146	stripper->f_outputted_space = 1;
117		} else {
118		if( !stripper->f_outputted_space &&
119		stripper->f_just_seen_tag ) {
120		*p_output++ = ' ';
121		stripper->f_outputted_space = 1;
122		} else {
123		/* this character must not be a space */
124		stripper->f_outputted_space = 0;
125		}
126	147	}
	148	}
	149	if( stripper->o_debug ) {
	150	printf("CHAR %c", *p_raw);
127	151	}
128	152	p_output++ = p_raw;
129	153	/* reset 'just seen tag' flag */
130	154	stripper->f_just_seen_tag = 0;
	155	/* reset 'outputted space' flag if character is not one */
	156	if (!isspace(*p_raw)) {
	157	stripper->f_outputted_space = 0;
	158	} else {
	159	stripper->f_outputted_space = 1;
	160	}
131	161	}
132	162	}
133	163	} /* in tag check */
134	164	p_raw++;
	165	if( stripper->o_debug ) {
	166	printf("\n");
	167	}
135	168	} /* while loop */
136	169
137	170	*p_output = 0;
	171
	172	if (stripper->o_auto_reset) {
	173	reset( stripper );
	174	}
138	175	}
139	176
140	177	void

146	183	/* hack to stop a space being output on strings starting with a tag */
147	184	stripper->f_outputted_space = 1;
148	185	stripper->f_just_seen_tag = 0;
149
	186
150	187	stripper->f_in_quote = 0;
151	188
152	189	stripper->f_in_decl = 0;
153	190	stripper->f_in_comment = 0;
154	191	stripper->f_lastchar_minus = 0;
155
	192
156	193	stripper->f_in_striptag = 0;
	194
157	195	}
158	196
159	197	void

171	209	}
172	210	}
173	211
	212	#ifdef _MSC_VER
	213	#define strcasecmp(a,b) stricmp(a,b)
	214	#endif
174	215
175	216	void
176	217	check_end( Stripper * stripper, char end ) {

179	220	stripper->f_lastchar_slash = 1;
180	221	} else {
181	222	/* if the current character is a '>', then the tag has ended */
182		if( end == '>' ) {
	223	/* slight hack to deal with mathematical characters in script tags:
	224	* if we're in a stripped block, and this is a closing tag, spaces
	225	* will also end the tag, since we only want it for comparison with
	226	* the opening one */
	227	if( (end == '>') \|\|
	228	(stripper->f_in_striptag && stripper->f_closing && isspace(end)) ) {
183	229	stripper->f_in_quote = 0;
184	230	stripper->f_in_comment = 0;
185	231	stripper->f_in_decl = 0;
186	232	stripper->f_in_tag = 0;
	233	stripper->f_closing = 0;
187	234	/* Do not start a stripped tag block if the tag is a closed one, e.g. '<script src="foo" />' */
188	235	if( stripper->f_lastchar_slash &&
189	236	(strcasecmp( stripper->striptag, stripper->tagname ) == 0) ) {

+4

-0

strip_html.h less more

26	26	int numstriptags;
27	27	int o_emit_spaces;
28	28	int o_decode_entities;
	29
	30	int o_auto_reset;
	31
	32	int o_debug;
29	33	} Stripper;
30	34
31	35	void strip_html( Stripper * stripper, const char * raw, char * clean );

+20

-0

t/auto-reset.t less more

	0
	1	use Test::More tests => 5;
	2
	3	BEGIN { use_ok 'HTML::Strip' }
	4
	5	{
	6	my $hs = HTML::Strip->new; # auto_reset off by default
	7	my $o = $hs->parse( "<html>\nTitle\n<script>a+b\n" );
	8	is( $o, "\nTitle\n" );
	9	my $o2 = $hs->parse( "c+d\n</script>\nEnd\n</html>" );
	10	is( $o2, "\nEnd\n" );
	11	}
	12
	13	{
	14	my $hs = HTML::Strip->new( auto_reset => 1 ); # auto_reset on
	15	my $o = $hs->parse( "<html>\nTitle\n<script>a+b\n" );
	16	is( $o, "\nTitle\n" );
	17	my $o2 = $hs->parse( "c+d\n</script>\nEnd\n</html>" );
	18	is( $o2, "c+d\n\nEnd\n" );
	19	}

+67

-0

t/basic.t less more

	0	use Test::More tests => 19;
	1
	2	BEGIN { use_ok 'HTML::Strip' }
	3
	4	{
	5	my $hs = new HTML::Strip;
	6
	7	is( $hs->parse( 'test' ), 'test', 'works with plain text' );
	8	$hs->eof;
	9
	10	is( $hs->parse( '<em>test</em>' ), 'test', 'works with <em>\|</em> tags' );
	11	$hs->eof;
	12
	13	is( $hs->parse( 'foo<br>bar' ), 'foo bar', 'works with <br> tag' );
	14	$hs->eof;
	15
	16	is( $hs->parse( '<p align="center">test</p>' ), 'test', 'works with tags with attributes' );
	17	$hs->eof;
	18
	19	is( $hs->parse( '<p align="center>test</p>' ), '', '"works" with non-terminated quotes' );
	20	$hs->eof;
	21
	22	is( $hs->parse( '<foo>bar' ), 'bar', 'strips <foo> tags' );
	23	is( $hs->parse( '</foo>baz' ), ' baz', 'strips </foo> tags' );
	24	$hs->eof;
	25
	26	is( $hs->parse( '<!-- <p>foo</p> bar -->baz' ), 'baz', 'strip comments' );
	27	$hs->eof;
	28
	29	is( $hs->parse( '<img src="foo.gif" alt="a > b">bar' ), 'bar', 'works with quote attributes which contain >' );
	30	$hs->eof;
	31
	32	is( $hs->parse( '<script> if (a>b && a<c) { ... } </script>bar' ), 'bar', '<script> tag and content are stripped' );
	33	$hs->eof;
	34
	35	is( $hs->parse( '<# just data #>bar' ), 'bar', 'weird tags get stripped' );
	36	$hs->eof;
	37
	38	TODO: {
	39	local $TODO = "fix CDATA handling";
	40	is( $hs->parse( '<![INCLUDE CDATA [ >>>>>>>>>>>> ]]>bar' ), 'bar', 'character data gets stripped' );
	41	$hs->eof;
	42	}
	43
	44	is( $hs->parse( '<script>foo</script>bar' ), 'bar', '<script> nodes are stripped' );
	45	$hs->eof;
	46
	47	my $has_html_entities = eval { require HTML::Entities; 1 };
	48	SKIP: {
	49	skip 'HTML::Entities not available', 2 unless $has_html_entities;
	50
	51	is( $hs->parse( '<foo>' ), '<foo>', 'numeric HTML entities are decoded' );
	52	$hs->eof;
	53	is( $hs->parse( '<foo>' ), '<foo>', 'HTML entities are decoded' );
	54	$hs->eof;
	55	}
	56
	57	$hs->set_decode_entities(0);
	58	is( $hs->parse( '<foo>' ), '<foo>', 'entities decoding off works' );
	59	$hs->eof;
	60
	61	is( $hs->parse( '<foo>' ), '<foo>', 'entities decoding off works' );
	62	$hs->eof;
	63
	64	is( $hs->parse( '<script>foo</script>bar' ), 'bar', '"script" is a default strip_tag' );
	65	$hs->eof;
	66	}

+38

-0

t/comment.t less more

	0
	1	# http://rt.cpan.org/Public/Bug/Display.html?id=32355
	2
	3	use Test::More tests => 7;
	4
	5	BEGIN { use_ok 'HTML::Strip' }
	6
	7	# stripping declarations
	8	{
	9	my $hs = HTML::Strip->new();
	10	is( $hs->parse( q{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"><html>Text</html>} ),
	11	"Text", 'decls are stripped' );
	12	$hs->eof;
	13	}
	14
	15	# stripping comments
	16	{
	17	my $hs = HTML::Strip->new();
	18	is( $hs->parse( q{<html><!-- a comment to be stripped -->Hello World!</html>} ),
	19	"Hello World!", "comments are stripped" );
	20	$hs->eof;
	21
	22	is( $hs->parse( q{<html><!-- comment with a ' apos -->Hello World!</html>} ),
	23	"Hello World!", q{comments may contain '} );
	24	$hs->eof;
	25
	26	is( $hs->parse( q{<html><!-- comment with a " quote -->Hello World!</html>} ),
	27	"Hello World!", q{comments may contain "} );
	28	$hs->eof;
	29
	30	is( $hs->parse( q{<html><!-- comment -- "quote" >Hello World!</html>} ),
	31	"Hello World!", "weird decls are stripped" );
	32	$hs->eof;
	33
	34	is( $hs->parse( "a<>b" ),
	35	"a b", 'edge case with <> ok' );
	36
	37	}

+20

-0

t/edge-case.t less more

	0	use Test::More;
	1
	2	BEGIN { use_ok 'HTML::Strip' }
	3
	4	# test for RT#21008
	5
	6	# stripping comments
	7	{
	8	my $hs = HTML::Strip->new();
	9	is( $hs->parse( "a<>b" ), "a b", 'edge case with <> ok' );
	10	$hs->eof;
	11	is( $hs->parse( "a<>b c<>d" ), "a b c d", 'edge case with <>s ok' );
	12	$hs->eof;
	13	is( $hs->parse( "From: <>\n\na. Title: some text\n\nb. etc\n" ), "From: \n\na. Title: some text\n\nb. etc\n", 'test case' );
	14	is( $hs->parse( "From: <>\n\na. Title: some text\n\nb. etc\n" ), "From: \n\na. Title: some text\n\nb. etc\n", 'test case' );
	15	$hs->eof;
	16	is( $hs->parse( q{this is an "example" with 'quoted' parts that should not be stripped} ), q{this is an "example" with 'quoted' parts that should not be stripped} );
	17	}
	18
	19	done_testing;

+18

-0

t/filter.t less more

	0	use Test::More tests => 3;
	1
	2	BEGIN { use_ok 'HTML::Strip' }
	3
	4	{
	5	my $hs = HTML::Strip->new( filter => undef );
	6	ok( $hs->parse( '<html> </html>' ), ' ' );
	7	$hs->eof;
	8
	9	}
	10
	11	{
	12	my $filter = sub { my $s = shift; $s =~ s/\s/ /g;; $s };
	13	my $hs = HTML::Strip->new( filter => $filter );
	14	ok( $hs->parse( "<html>title\ntext\ntext</html>" ), 'title text text' );
	15	$hs->eof;
	16
	17	}

+30

-0

t/mathematical-comparisons.t less more

	0	use Test::More tests => 3;
	1
	2	BEGIN { use_ok 'HTML::Strip' }
	3
	4	# test for RT#35345
	5	{
	6	my $hs = HTML::Strip->new();
	7	is( $hs->parse( <<EOF ), "\nHello\n", "mathematical comparisons in strip tags bug RT#35345" );
	8	<script>
	9	function shovelerMain (detectBuyBox) {
	10	for (var i = 0; i < Shoveler.Instances.length; i++) {
	11	...
	12	</script>
	13	<h1>Hello</h1>
	14	EOF
	15	$hs->eof;
	16	}
	17
	18	# test for RT#99207
	19	{
	20	my $hs = HTML::Strip->new();
	21	is( $hs->parse( <<EOF ), "\nhallo\n", "mathematical comparisons in strip tags bug RT#99207" );
	22	<script type="text/javascript">
	23	document.write('<scr'+'ipt src="//www3.smartadserver.com/call/pubj/' + sas_config.pageid + '/' + formatid + '/' + sas_config.master + '/' + sas_config.tmstp + '/' + encodeURIComponent(target) + '?"></scr'+'ipt>');
	24	</script>
	25	<span>hallo</span>
	26	EOF
	27	$hs->eof;
	28	}
	29

+12

-0

t/offbyone.t less more

	0	use strict;
	1
	2	# test for RT#94713
	3	use Test::More tests => 1;
	4
	5	my $INC = join ' ', map { "-I$_" } @INC;
	6
	7	SKIP: {
	8	skip "test fails on windows", 1 if $^O eq 'MSWin32';
	9	is(`MALLOC_OPTIONS=Z $^X $INC -MHTML::Strip -e 'print HTML::Strip->new->parse(q[<li>abc < 0.5 km</li><li>xyz</li>])'`, q[abc xyz]);
	10	}
	11

+31

-0

t/striptags.t less more

	0	use Test::More tests => 6;
	1
	2	BEGIN { use_ok 'HTML::Strip' }
	3
	4	{
	5	# set_striptags( \@ARRAY )
	6	my $hs = HTML::Strip->new;
	7	$hs->set_striptags( [ 'foo' ] );
	8
	9	is( $hs->parse( '<script>foo</script>bar' ), 'foo bar', 'set_striptags redefinition works' );
	10	$hs->eof;
	11
	12	is( $hs->parse( '<foo>foo</foo>bar' ), 'bar', 'set_striptags redefinition works' );
	13	$hs->eof;
	14	}
	15
	16	{
	17	# set_striptags( LIST )
	18	my @striptags = qw(baz quux);
	19	my $hs = HTML::Strip->new;
	20	$hs->set_striptags( @striptags );
	21
	22	is( $hs->parse( '<baz>fumble</baz>bar<quux>foo</quux>' ), 'bar', 'stripping user-defined tags ok' );
	23	$hs->eof;
	24
	25	is( $hs->parse( '<baz>fumble<quux/>foo</baz>bar' ), 'bar', 'stripping user-defined tags ok' );
	26	$hs->eof;
	27
	28	is( $hs->parse( '<foo> </foo> <bar> baz </bar>' ), ' baz ', 'stripping user-defined tags ok' );
	29	$hs->eof;
	30	}

+10

-0

t/whitespace-single-char.t less more

	0	use Test::More tests => 2;
	1
	2	BEGIN { use_ok 'HTML::Strip' }
	3
	4	# test for RT#19036
	5	{
	6	my $hs = HTML::Strip->new();
	7	is( $hs->parse( '<tr><td>01 May 2006</td><td>0</td><td>10</td></tr>' ), '01 May 2006 0 10', "whitespace single character bug" );
	8	$hs->eof;
	9	}

+0

-91

~~test.pl~~ less more

0		# Before `make install' is performed this script should be runnable with
1		# `make test'. After `make install' it should work as `perl test.pl'
2
3		#########################
4
5		# change 'tests => 1' to 'tests => last_test_to_print';
6
7		use Test;
8		BEGIN { plan tests => 17 };
9		use HTML::Strip;
10		ok(1); # If we made it this far, we're ok.
11
12		#########################
13
14		# Insert your test code below, the Test module is use()ed here so read
15		# its man page ( perldoc Test ) for help writing this test script.
16
17		my $hs = new HTML::Strip;
18
19		ok( $hs->parse( 'test' ), 'test' );
20		$hs->eof;
21
22		ok( $hs->parse( '<em>test</em>' ), 'test' );
23		$hs->eof;
24
25		ok( $hs->parse( 'foo<br>bar' ), 'foo bar' );
26		$hs->eof;
27
28		ok( $hs->parse( '<p align="center">test</p>' ), 'test' );
29		$hs->eof;
30
31		ok( $hs->parse( '<p align="center>test</p>' ), '' );
32		$hs->eof;
33
34		ok( $hs->parse( '<foo>bar' ), 'bar' );
35		ok( $hs->parse( '</foo>baz' ), ' baz' );
36		$hs->eof;
37
38		ok( $hs->parse( '<!-- <p>foo</p> bar -->baz' ), 'baz' );
39		$hs->eof;
40
41		ok( $hs->parse( '<img src="foo.gif" alt="a > b">bar' ), 'bar' );
42		$hs->eof;
43
44		ok( $hs->parse( '<script>if (a<b && a>c)</script>bar' ), 'bar' );
45		$hs->eof;
46
47		ok( $hs->parse( '<# just data #>bar' ), 'bar' );
48		$hs->eof;
49
50		#ok( $hs->parse( '<![INCLUDE CDATA [ >>>>>>>>>>>> ]]>bar' ), 'bar' );
51		#$hs->eof;
52
53		ok( $hs->parse( '<script>foo</script>bar' ), 'bar' );
54		$hs->eof;
55
56		my $html_entities_p = eval 'require HTML::Entities' ? '' : 'HTML::Entities not available';
57		skip( $html_entities_p, $hs->parse( '<foo>' ), '<foo>' );
58		$hs->eof;
59		skip( $html_entities_p, $hs->parse( '<foo>' ), '<foo>' );
60		$hs->eof;
61		$hs->set_decode_entities(0);
62		skip( $html_entities_p, $hs->parse( '<foo>' ), '<foo>' );
63		$hs->eof;
64		skip( $html_entities_p, $hs->parse( '<foo>' ), '<foo>' );
65		$hs->eof;
66
67
68		my $hs2 = new HTML::Strip;
69		$hs2->set_striptags( [ 'foo' ] );
70
71		ok( $hs2->parse( '<script>foo</script>bar' ), 'foo bar' );
72		$hs2->eof;
73
74		ok( $hs2->parse( '<foo>foo</foo>bar' ), 'bar' );
75		$hs2->eof;
76
77		ok( $hs->parse( '<script>foo</script>bar' ), 'bar' );
78		$hs->eof;
79
80		my @striptags = qw(baz quux);
81		$hs->set_striptags( @striptags );
82
83		ok( $hs->parse( '<baz>fumble</baz>bar<quux>foo</quux>' ), 'bar' );
84		$hs->eof;
85
86		ok( $hs->parse( '<baz>fumble<quux/>foo</baz>bar' ), 'bar' );
87		$hs->eof;
88
89		ok( $hs->parse( '<foo> </foo> <bar> baz </bar>' ), ' baz ' );
90		$hs->eof;