Codebase list golang-github-jaytaylor-html2text / 24afd27
New upstream version 0.0~git20170217.0.24f9b0f Michael Lustfield 7 years ago
5 changed file(s) with 1086 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
0 language: go
1 go:
2 - tip
3 - 1.8
4 - 1.7
5 - 1.6
6 - 1.5
7 - 1.4
8 - 1.3
9 - 1.2
10 notifications:
11 email:
12 on_success: change
13 on_failure: always
0 The MIT License (MIT)
1
2 Copyright (c) 2015 Jay Taylor
3
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10
11 The above copyright notice and this permission notice shall be included in all
12 copies or substantial portions of the Software.
13
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 SOFTWARE.
21
0 # html2text
1
2 [![Documentation](https://godoc.org/github.com/jaytaylor/html2text?status.svg)](https://godoc.org/github.com/jaytaylor/html2text)
3 [![Build Status](https://travis-ci.org/jaytaylor/html2text.svg?branch=master)](https://travis-ci.org/jaytaylor/html2text)
4 [![Report Card](https://goreportcard.com/badge/github.com/jaytaylor/html2text)](https://goreportcard.com/report/github.com/jaytaylor/html2text)
5
6 ### Converts HTML into text
7
8
9 ## Introduction
10
11 Ensure your emails are readable by all!
12
13 Turns HTML into raw text, useful for sending fancy HTML emails with an equivalently nicely formatted TXT document as a fallback (e.g. for people who don't allow HTML emails or have other display issues).
14
15 html2text is a simple golang package for rendering HTML into plaintext.
16
17 There are still lots of improvements to be had, but FWIW this has worked fine for my [basic] HTML-2-text needs.
18
19 It requires go 1.x or newer ;)
20
21
22 ## Download the package
23
24 ```bash
25 go get github.com/jaytaylor/html2text
26 ```
27
28 ## Example usage
29
30 ```go
31 package main
32
33 import (
34 "fmt"
35
36 "github.com/jaytaylor/html2text"
37 )
38
39 func main() {
40 inputHtml := `
41 <html>
42 <head>
43 <title>My Mega Service</title>
44 <link rel=\"stylesheet\" href=\"main.css\">
45 <style type=\"text/css\">body { color: #fff; }</style>
46 </head>
47
48 <body>
49 <div class="logo">
50 <a href="http://mymegaservice.com/"><img src="/logo-image.jpg" alt="Mega Service"/></a>
51 </div>
52
53 <h1>Welcome to your new account on my service!</h1>
54
55 <p>
56 Here is some more information:
57
58 <ul>
59 <li>Link 1: <a href="https://example.com">Example.com</a></li>
60 <li>Link 2: <a href="https://example2.com">Example2.com</a></li>
61 <li>Something else</li>
62 </ul>
63 </p>
64 </body>
65 </html>
66 `
67
68 text, err := html2text.FromString(inputHtml)
69 if err != nil {
70 panic(err)
71 }
72 fmt.Println(text)
73 }
74 ```
75
76 Output:
77 ```
78 Mega Service ( http://mymegaservice.com/ )
79
80 ******************************************
81 Welcome to your new account on my service!
82 ******************************************
83
84 Here is some more information:
85
86 * Link 1: Example.com ( https://example.com )
87 * Link 2: Example2.com ( https://example2.com )
88 * Something else
89 ```
90
91
92 ## Unit-tests
93
94 Running the unit-tests is straightforward and standard:
95
96 ```bash
97 go test
98 ```
99
100
101 ## License
102
103 Permissive MIT license.
104
105
106 ## Contact
107
108 You are more than welcome to open issues and send pull requests if you find a bug or want a new feature.
109
110 If you appreciate this library please feel free to drop me a line and tell me! It's always nice to hear from people who have benefitted from my work.
111
112 Email: jay at (my github username).com
113
114 Twitter: [@jtaylor](https://twitter.com/jtaylor)
115
0 package html2text
1
2 import (
3 "bytes"
4 "io"
5 "regexp"
6 "strings"
7 "unicode"
8
9 "golang.org/x/net/html"
10 "golang.org/x/net/html/atom"
11 )
12
// spacingRe matches a run of one or more spaces, carriage returns, newlines
// or tabs; it is used to collapse whitespace inside text nodes.
var spacingRe = regexp.MustCompile(`[ \r\n\t]+`)

// newlineRe matches two or more consecutive newlines; it is used to squeeze
// the final output down to at most one blank line between paragraphs.
var newlineRe = regexp.MustCompile(`\n\n+`)
17
// textifyTraverseCtx carries the output buffer and the formatting state that
// is threaded through a single HTML-to-text traversal.
type textifyTraverseCtx struct {
	// Buf accumulates the rendered plain text.
	Buf bytes.Buffer

	// prefix is written after every newline that is emitted (the ">" quote
	// markers while inside blockquotes).
	prefix string
	// blockquoteLevel is the current <blockquote> nesting depth.
	blockquoteLevel int
	// lineLength counts the characters emitted since the last newline; the
	// prefix itself is not counted.
	lineLength int
	// endsWithSpace records whether the last emitted chunk ended in whitespace.
	endsWithSpace bool
	// endsWithNewline is not referenced by any code visible in this file.
	endsWithNewline bool
	// justClosedDiv is set after a <div> has been rendered and cleared when
	// the next element is visited, so adjacent closing divs add one newline.
	justClosedDiv bool
}
28
29 func (ctx *textifyTraverseCtx) traverse(node *html.Node) error {
30 switch node.Type {
31
32 default:
33 return ctx.traverseChildren(node)
34
35 case html.TextNode:
36 data := strings.Trim(spacingRe.ReplaceAllString(node.Data, " "), " ")
37 return ctx.emit(data)
38
39 case html.ElementNode:
40
41 ctx.justClosedDiv = false
42 switch node.DataAtom {
43 case atom.Br:
44 return ctx.emit("\n")
45
46 case atom.H1, atom.H2, atom.H3:
47 subCtx := textifyTraverseCtx{}
48 if err := subCtx.traverseChildren(node); err != nil {
49 return err
50 }
51
52 str := subCtx.Buf.String()
53 dividerLen := 0
54 for _, line := range strings.Split(str, "\n") {
55 if lineLen := len([]rune(line)); lineLen-1 > dividerLen {
56 dividerLen = lineLen - 1
57 }
58 }
59 divider := ""
60 if node.DataAtom == atom.H1 {
61 divider = strings.Repeat("*", dividerLen)
62 } else {
63 divider = strings.Repeat("-", dividerLen)
64 }
65
66 if node.DataAtom == atom.H3 {
67 return ctx.emit("\n\n" + str + "\n" + divider + "\n\n")
68 }
69 return ctx.emit("\n\n" + divider + "\n" + str + "\n" + divider + "\n\n")
70
71 case atom.Blockquote:
72 ctx.blockquoteLevel++
73 ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + " "
74 if err := ctx.emit("\n"); err != nil {
75 return err
76 }
77 if ctx.blockquoteLevel == 1 {
78 if err := ctx.emit("\n"); err != nil {
79 return err
80 }
81 }
82 if err := ctx.traverseChildren(node); err != nil {
83 return err
84 }
85 ctx.blockquoteLevel--
86 ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel)
87 if ctx.blockquoteLevel > 0 {
88 ctx.prefix += " "
89 }
90 return ctx.emit("\n\n")
91
92 case atom.Div:
93 if ctx.lineLength > 0 {
94 if err := ctx.emit("\n"); err != nil {
95 return err
96 }
97 }
98 if err := ctx.traverseChildren(node); err != nil {
99 return err
100 }
101 var err error
102 if ctx.justClosedDiv == false {
103 err = ctx.emit("\n")
104 }
105 ctx.justClosedDiv = true
106 return err
107
108 case atom.Li:
109 if err := ctx.emit("* "); err != nil {
110 return err
111 }
112
113 if err := ctx.traverseChildren(node); err != nil {
114 return err
115 }
116
117 return ctx.emit("\n")
118
119 case atom.B, atom.Strong:
120 subCtx := textifyTraverseCtx{}
121 subCtx.endsWithSpace = true
122 if err := subCtx.traverseChildren(node); err != nil {
123 return err
124 }
125 str := subCtx.Buf.String()
126 return ctx.emit("*" + str + "*")
127
128 case atom.A:
129 // If image is the only child, take its alt text as the link text
130 if img := node.FirstChild; img != nil && node.LastChild == img && img.DataAtom == atom.Img {
131 if altText := getAttrVal(img, "alt"); altText != "" {
132 ctx.emit(altText)
133 }
134 } else if err := ctx.traverseChildren(node); err != nil {
135 return err
136 }
137
138 hrefLink := ""
139 if attrVal := getAttrVal(node, "href"); attrVal != "" {
140 attrVal = ctx.normalizeHrefLink(attrVal)
141 if attrVal != "" {
142 hrefLink = "( " + attrVal + " )"
143 }
144 }
145
146 return ctx.emit(hrefLink)
147
148 case atom.P, atom.Ul, atom.Table:
149 if err := ctx.emit("\n\n"); err != nil {
150 return err
151 }
152
153 if err := ctx.traverseChildren(node); err != nil {
154 return err
155 }
156
157 return ctx.emit("\n\n")
158
159 case atom.Tr:
160 if err := ctx.traverseChildren(node); err != nil {
161 return err
162 }
163
164 return ctx.emit("\n")
165
166 case atom.Style, atom.Script, atom.Head:
167 // Ignore the subtree
168 return nil
169
170 default:
171 return ctx.traverseChildren(node)
172 }
173 }
174 }
175
176 func (ctx *textifyTraverseCtx) traverseChildren(node *html.Node) error {
177 for c := node.FirstChild; c != nil; c = c.NextSibling {
178 if err := ctx.traverse(c); err != nil {
179 return err
180 }
181 }
182
183 return nil
184 }
185
186 func (ctx *textifyTraverseCtx) emit(data string) error {
187 if len(data) == 0 {
188 return nil
189 }
190 lines := ctx.breakLongLines(data)
191 var err error
192 for _, line := range lines {
193 runes := []rune(line)
194 startsWithSpace := unicode.IsSpace(runes[0])
195 if !startsWithSpace && !ctx.endsWithSpace {
196 ctx.Buf.WriteByte(' ')
197 ctx.lineLength++
198 }
199 ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1])
200 for _, c := range line {
201 _, err = ctx.Buf.WriteString(string(c))
202 if err != nil {
203 return err
204 }
205 ctx.lineLength++
206 if c == '\n' {
207 ctx.lineLength = 0
208 if ctx.prefix != "" {
209 _, err = ctx.Buf.WriteString(ctx.prefix)
210 if err != nil {
211 return err
212 }
213 }
214 }
215 }
216 }
217 return nil
218 }
219
220 func (ctx *textifyTraverseCtx) breakLongLines(data string) []string {
221 // only break lines when we are in blockquotes
222 if ctx.blockquoteLevel == 0 {
223 return []string{data}
224 }
225 var ret []string
226 runes := []rune(data)
227 l := len(runes)
228 existing := ctx.lineLength
229 if existing >= 74 {
230 ret = append(ret, "\n")
231 existing = 0
232 }
233 for l+existing > 74 {
234 i := 74 - existing
235 for i >= 0 && !unicode.IsSpace(runes[i]) {
236 i--
237 }
238 if i == -1 {
239 // no spaces, so go the other way
240 i = 74 - existing
241 for i < l && !unicode.IsSpace(runes[i]) {
242 i++
243 }
244 }
245 ret = append(ret, string(runes[:i])+"\n")
246 for i < l && unicode.IsSpace(runes[i]) {
247 i++
248 }
249 runes = runes[i:]
250 l = len(runes)
251 existing = 0
252 }
253 if len(runes) > 0 {
254 ret = append(ret, string(runes))
255 }
256 return ret
257 }
258
259 func (ctx *textifyTraverseCtx) normalizeHrefLink(link string) string {
260 link = strings.TrimSpace(link)
261 link = strings.TrimPrefix(link, "mailto:")
262 return link
263 }
264
265 func getAttrVal(node *html.Node, attrName string) string {
266 for _, attr := range node.Attr {
267 if attr.Key == attrName {
268 return attr.Val
269 }
270 }
271
272 return ""
273 }
274
275 func FromHtmlNode(doc *html.Node) (string, error) {
276 ctx := textifyTraverseCtx{
277 Buf: bytes.Buffer{},
278 }
279 if err := ctx.traverse(doc); err != nil {
280 return "", err
281 }
282
283 text := strings.TrimSpace(newlineRe.ReplaceAllString(
284 strings.Replace(ctx.Buf.String(), "\n ", "\n", -1), "\n\n"))
285 return text, nil
286
287 }
288
289 func FromReader(reader io.Reader) (string, error) {
290 doc, err := html.Parse(reader)
291 if err != nil {
292 return "", err
293 }
294 return FromHtmlNode(doc)
295 }
296
297 func FromString(input string) (string, error) {
298 text, err := FromReader(strings.NewReader(input))
299 if err != nil {
300 return "", err
301 }
302 return text, nil
303 }
0 package html2text
1
2 import (
3 "fmt"
4 "regexp"
5 "testing"
6 )
7
8 func TestStrippingWhitespace(t *testing.T) {
9 testCases := []struct {
10 input string
11 output string
12 }{
13 {
14 "test text",
15 "test text",
16 },
17 {
18 " \ttext\ntext\n",
19 "text text",
20 },
21 {
22 " \na \n\t \n \n a \t",
23 "a a",
24 },
25 {
26 "test text",
27 "test text",
28 },
29 {
30 "test&nbsp;&nbsp;&nbsp; text&nbsp;",
31 "test    text",
32 },
33 }
34
35 for _, testCase := range testCases {
36 assertString(t, testCase.input, testCase.output)
37 }
38 }
39
40 func TestParagraphsAndBreaks(t *testing.T) {
41 testCases := []struct {
42 input string
43 output string
44 }{
45 {
46 "Test text",
47 "Test text",
48 },
49 {
50 "Test text<br>",
51 "Test text",
52 },
53 {
54 "Test text<br>Test",
55 "Test text\nTest",
56 },
57 {
58 "<p>Test text</p>",
59 "Test text",
60 },
61 {
62 "<p>Test text</p><p>Test text</p>",
63 "Test text\n\nTest text",
64 },
65 {
66 "\n<p>Test text</p>\n\n\n\t<p>Test text</p>\n",
67 "Test text\n\nTest text",
68 },
69 {
70 "\n<p>Test text<br/>Test text</p>\n",
71 "Test text\nTest text",
72 },
73 {
74 "\n<p>Test text<br> \tTest text<br></p>\n",
75 "Test text\nTest text",
76 },
77 {
78 "Test text<br><BR />Test text",
79 "Test text\n\nTest text",
80 },
81 }
82
83 for _, testCase := range testCases {
84 assertString(t, testCase.input, testCase.output)
85 }
86 }
87
88 func TestTables(t *testing.T) {
89 testCases := []struct {
90 input string
91 output string
92 }{
93 {
94 "<table><tr><td></td><td></td></tr></table>",
95 "",
96 },
97 {
98 "<table><tr><td>cell1</td><td>cell2</td></tr></table>",
99 "cell1 cell2",
100 },
101 {
102 "<table><tr><td>row1</td></tr><tr><td>row2</td></tr></table>",
103 "row1\nrow2",
104 },
105 {
106 `<table>
107 <tr><td>cell1-1</td><td>cell1-2</td></tr>
108 <tr><td>cell2-1</td><td>cell2-2</td></tr>
109 </table>`,
110 "cell1-1 cell1-2\ncell2-1 cell2-2",
111 },
112 {
113 "_<table><tr><td>cell</td></tr></table>_",
114 "_\n\ncell\n\n_",
115 },
116 }
117
118 for _, testCase := range testCases {
119 assertString(t, testCase.input, testCase.output)
120 }
121 }
122
123 func TestStrippingLists(t *testing.T) {
124 testCases := []struct {
125 input string
126 output string
127 }{
128 {
129 "<ul></ul>",
130 "",
131 },
132 {
133 "<ul><li>item</li></ul>_",
134 "* item\n\n_",
135 },
136 {
137 "<li class='123'>item 1</li> <li>item 2</li>\n_",
138 "* item 1\n* item 2\n_",
139 },
140 {
141 "<li>item 1</li> \t\n <li>item 2</li> <li> item 3</li>\n_",
142 "* item 1\n* item 2\n* item 3\n_",
143 },
144 }
145
146 for _, testCase := range testCases {
147 assertString(t, testCase.input, testCase.output)
148 }
149 }
150
151 func TestLinks(t *testing.T) {
152 testCases := []struct {
153 input string
154 output string
155 }{
156 {
157 `<a></a>`,
158 ``,
159 },
160 {
161 `<a href=""></a>`,
162 ``,
163 },
164 {
165 `<a href="http://example.com/"></a>`,
166 `( http://example.com/ )`,
167 },
168 {
169 `<a href="">Link</a>`,
170 `Link`,
171 },
172 {
173 `<a href="http://example.com/">Link</a>`,
174 `Link ( http://example.com/ )`,
175 },
176 {
177 `<a href="http://example.com/"><span class="a">Link</span></a>`,
178 `Link ( http://example.com/ )`,
179 },
180 {
181 "<a href='http://example.com/'>\n\t<span class='a'>Link</span>\n\t</a>",
182 `Link ( http://example.com/ )`,
183 },
184 {
185 "<a href='mailto:contact@example.org'>Contact Us</a>",
186 `Contact Us ( contact@example.org )`,
187 },
188 {
189 "<a href=\"http://example.com:80/~user?aaa=bb&amp;c=d,e,f#foo\">Link</a>",
190 `Link ( http://example.com:80/~user?aaa=bb&c=d,e,f#foo )`,
191 },
192 {
193 "<a title='title' href=\"http://example.com/\">Link</a>",
194 `Link ( http://example.com/ )`,
195 },
196 {
197 "<a href=\" http://example.com/ \"> Link </a>",
198 `Link ( http://example.com/ )`,
199 },
200 {
201 "<a href=\"http://example.com/a/\">Link A</a> <a href=\"http://example.com/b/\">Link B</a>",
202 `Link A ( http://example.com/a/ ) Link B ( http://example.com/b/ )`,
203 },
204 {
205 "<a href=\"%%LINK%%\">Link</a>",
206 `Link ( %%LINK%% )`,
207 },
208 {
209 "<a href=\"[LINK]\">Link</a>",
210 `Link ( [LINK] )`,
211 },
212 {
213 "<a href=\"{LINK}\">Link</a>",
214 `Link ( {LINK} )`,
215 },
216 {
217 "<a href=\"[[!unsubscribe]]\">Link</a>",
218 `Link ( [[!unsubscribe]] )`,
219 },
220 {
221 "<p>This is <a href=\"http://www.google.com\" >link1</a> and <a href=\"http://www.google.com\" >link2 </a> is next.</p>",
222 `This is link1 ( http://www.google.com ) and link2 ( http://www.google.com ) is next.`,
223 },
224 }
225
226 for _, testCase := range testCases {
227 assertString(t, testCase.input, testCase.output)
228 }
229 }
230
231 func TestImageAltTags(t *testing.T) {
232 testCases := []struct {
233 input string
234 output string
235 }{
236 {
237 `<img />`,
238 ``,
239 },
240 {
241 `<img src="http://example.ru/hello.jpg" />`,
242 ``,
243 },
244 {
245 `<img alt="Example"/>`,
246 ``,
247 },
248 {
249 `<img src="http://example.ru/hello.jpg" alt="Example"/>`,
250 ``,
251 },
252 // Images do matter if they are in a link
253 {
254 `<a href="http://example.com/"><img src="http://example.ru/hello.jpg" alt="Example"/></a>`,
255 `Example ( http://example.com/ )`,
256 },
257 {
258 `<a href="http://example.com/"><img src="http://example.ru/hello.jpg" alt="Example"></a>`,
259 `Example ( http://example.com/ )`,
260 },
261 {
262 `<a href='http://example.com/'><img src='http://example.ru/hello.jpg' alt='Example'/></a>`,
263 `Example ( http://example.com/ )`,
264 },
265 {
266 `<a href='http://example.com/'><img src='http://example.ru/hello.jpg' alt='Example'></a>`,
267 `Example ( http://example.com/ )`,
268 },
269 }
270
271 for _, testCase := range testCases {
272 assertString(t, testCase.input, testCase.output)
273 }
274 }
275
276 func TestHeadings(t *testing.T) {
277 testCases := []struct {
278 input string
279 output string
280 }{
281 {
282 "<h1>Test</h1>",
283 "****\nTest\n****",
284 },
285 {
286 "\t<h1>\nTest</h1> ",
287 "****\nTest\n****",
288 },
289 {
290 "\t<h1>\nTest line 1<br>Test 2</h1> ",
291 "***********\nTest line 1\nTest 2\n***********",
292 },
293 {
294 "<h1>Test</h1> <h1>Test</h1>",
295 "****\nTest\n****\n\n****\nTest\n****",
296 },
297 {
298 "<h2>Test</h2>",
299 "----\nTest\n----",
300 },
301 {
302 "<h1><a href='http://example.com/'>Test</a></h1>",
303 "****************************\nTest ( http://example.com/ )\n****************************",
304 },
305 {
306 "<h3> <span class='a'>Test </span></h3>",
307 "Test\n----",
308 },
309 }
310
311 for _, testCase := range testCases {
312 assertString(t, testCase.input, testCase.output)
313 }
314
315 }
316
317 func TestBold(t *testing.T) {
318 testCases := []struct {
319 input string
320 output string
321 }{
322 {
323 "<b>Test</b>",
324 "*Test*",
325 },
326 {
327 "\t<b>Test</b> ",
328 "*Test*",
329 },
330 {
331 "\t<b>Test line 1<br>Test 2</b> ",
332 "*Test line 1\nTest 2*",
333 },
334 {
335 "<b>Test</b> <b>Test</b>",
336 "*Test* *Test*",
337 },
338 }
339
340 for _, testCase := range testCases {
341 assertString(t, testCase.input, testCase.output)
342 }
343
344 }
345
346 func TestDiv(t *testing.T) {
347 testCases := []struct {
348 input string
349 output string
350 }{
351 {
352 "<div>Test</div>",
353 "Test",
354 },
355 {
356 "\t<div>Test</div> ",
357 "Test",
358 },
359 {
360 "<div>Test line 1<div>Test 2</div></div>",
361 "Test line 1\nTest 2",
362 },
363 {
364 "Test 1<div>Test 2</div> <div>Test 3</div>Test 4",
365 "Test 1\nTest 2\nTest 3\nTest 4",
366 },
367 }
368
369 for _, testCase := range testCases {
370 assertString(t, testCase.input, testCase.output)
371 }
372
373 }
374
375 func TestBlockquotes(t *testing.T) {
376 testCases := []struct {
377 input string
378 output string
379 }{
380 {
381 "<div>level 0<blockquote>level 1<br><blockquote>level 2</blockquote>level 1</blockquote><div>level 0</div></div>",
382 "level 0\n> \n> level 1\n> \n>> level 2\n> \n> level 1\n\nlevel 0",
383 },
384 {
385 "<blockquote>Test</blockquote>Test",
386 "> \n> Test\n\nTest",
387 },
388 {
389 "\t<blockquote> \nTest<br></blockquote> ",
390 "> \n> Test\n>",
391 },
392 {
393 "\t<blockquote> \nTest line 1<br>Test 2</blockquote> ",
394 "> \n> Test line 1\n> Test 2",
395 },
396 {
397 "<blockquote>Test</blockquote> <blockquote>Test</blockquote> Other Test",
398 "> \n> Test\n\n> \n> Test\n\nOther Test",
399 },
400 {
401 "<blockquote>Lorem ipsum Commodo id consectetur pariatur ea occaecat minim aliqua ad sit consequat quis ex commodo Duis incididunt eu mollit consectetur fugiat voluptate dolore in pariatur in commodo occaecat Ut occaecat velit esse labore aute quis commodo non sit dolore officia Excepteur cillum amet cupidatat culpa velit labore ullamco dolore mollit elit in aliqua dolor irure do</blockquote>",
402 "> \n> Lorem ipsum Commodo id consectetur pariatur ea occaecat minim aliqua ad\n> sit consequat quis ex commodo Duis incididunt eu mollit consectetur fugiat\n> voluptate dolore in pariatur in commodo occaecat Ut occaecat velit esse\n> labore aute quis commodo non sit dolore officia Excepteur cillum amet\n> cupidatat culpa velit labore ullamco dolore mollit elit in aliqua dolor\n> irure do",
403 },
404 {
405 "<blockquote>Lorem<b>ipsum</b><b>Commodo</b><b>id</b><b>consectetur</b><b>pariatur</b><b>ea</b><b>occaecat</b><b>minim</b><b>aliqua</b><b>ad</b><b>sit</b><b>consequat</b><b>quis</b><b>ex</b><b>commodo</b><b>Duis</b><b>incididunt</b><b>eu</b><b>mollit</b><b>consectetur</b><b>fugiat</b><b>voluptate</b><b>dolore</b><b>in</b><b>pariatur</b><b>in</b><b>commodo</b><b>occaecat</b><b>Ut</b><b>occaecat</b><b>velit</b><b>esse</b><b>labore</b><b>aute</b><b>quis</b><b>commodo</b><b>non</b><b>sit</b><b>dolore</b><b>officia</b><b>Excepteur</b><b>cillum</b><b>amet</b><b>cupidatat</b><b>culpa</b><b>velit</b><b>labore</b><b>ullamco</b><b>dolore</b><b>mollit</b><b>elit</b><b>in</b><b>aliqua</b><b>dolor</b><b>irure</b><b>do</b></blockquote>",
406 "> \n> Lorem *ipsum* *Commodo* *id* *consectetur* *pariatur* *ea* *occaecat* *minim*\n> *aliqua* *ad* *sit* *consequat* *quis* *ex* *commodo* *Duis* *incididunt* *eu*\n> *mollit* *consectetur* *fugiat* *voluptate* *dolore* *in* *pariatur* *in* *commodo*\n> *occaecat* *Ut* *occaecat* *velit* *esse* *labore* *aute* *quis* *commodo*\n> *non* *sit* *dolore* *officia* *Excepteur* *cillum* *amet* *cupidatat* *culpa*\n> *velit* *labore* *ullamco* *dolore* *mollit* *elit* *in* *aliqua* *dolor* *irure*\n> *do*",
407 },
408 }
409
410 for _, testCase := range testCases {
411 assertString(t, testCase.input, testCase.output)
412 }
413
414 }
415
416 func TestIgnoreStylesScriptsHead(t *testing.T) {
417 testCases := []struct {
418 input string
419 output string
420 }{
421 {
422 "<style>Test</style>",
423 "",
424 },
425 {
426 "<style type=\"text/css\">body { color: #fff; }</style>",
427 "",
428 },
429 {
430 "<link rel=\"stylesheet\" href=\"main.css\">",
431 "",
432 },
433 {
434 "<script>Test</script>",
435 "",
436 },
437 {
438 "<script src=\"main.js\"></script>",
439 "",
440 },
441 {
442 "<script type=\"text/javascript\" src=\"main.js\"></script>",
443 "",
444 },
445 {
446 "<script type=\"text/javascript\">Test</script>",
447 "",
448 },
449 {
450 "<script type=\"text/ng-template\" id=\"template.html\"><a href=\"http://google.com\">Google</a></script>",
451 "",
452 },
453 {
454 "<script type=\"bla-bla-bla\" id=\"template.html\">Test</script>",
455 "",
456 },
457 {
458 `<html><head><title>Title</title></head><body></body></html>`,
459 "",
460 },
461 }
462
463 for _, testCase := range testCases {
464 assertString(t, testCase.input, testCase.output)
465 }
466 }
467
// TestText runs larger HTML fragments through the converter and matches the
// result against a regular expression (via assertRegexp) rather than an exact
// string. The last case feeds deliberately malformed HTML: unclosed <li>
// elements and an href value that spans two lines.
468 func TestText(t *testing.T) {
469 testCases := []struct {
470 input string
471 expr string
472 }{
473 {
474 `<li>
475 <a href="/new" data-ga-click="Header, create new repository, icon:repo"><span class="octicon octicon-repo"></span> New repository</a>
476 </li>`,
477 `\* New repository \( /new \)`,
478 },
479 {
480 `hi
481
482 <br>
483
484 hello <a href="https://google.com">google</a>
485 <br><br>
486 test<p>List:</p>
487
488 <ul>
489 <li><a href="foo">Foo</a></li>
490 <li><a href="http://www.microshwhat.com/bar/soapy">Barsoap</a></li>
491 <li>Baz</li>
492 </ul>
493 `,
494 `hi
495 hello google \( https://google.com \)
496
497 test
498
499 List:
500
501 \* Foo \( foo \)
502 \* Barsoap \( http://www.microshwhat.com/bar/soapy \)
503 \* Baz`,
504 },
505 // Malformed input html.
506 {
507 `hi
508
509 hello <a href="https://google.com">google</a>
510
511 test<p>List:</p>
512
513 <ul>
514 <li><a href="foo">Foo</a>
515 <li><a href="/
516 bar/baz">Bar</a>
517 <li>Baz</li>
518 </ul>
519 `,
520 `hi hello google \( https://google.com \) test
521
522 List:
523
524 \* Foo \( foo \)
525 \* Bar \( /\n[ \t]+bar/baz \)
526 \* Baz`,
527 },
528 }
529
530 for _, testCase := range testCases {
531 assertRegexp(t, testCase.input, testCase.expr)
532 }
533 }
534
// StringMatcher decides whether a converted text satisfies an expectation and
// can describe that expectation for failure messages.
type StringMatcher interface {
	// String returns a printable form of the expectation.
	String() string
	// MatchString reports whether the given text meets the expectation.
	MatchString(string) bool
}
539
// RegexpStringMatcher is a StringMatcher whose expectation is a regular
// expression pattern.
type RegexpStringMatcher string

// MatchString compiles the pattern and reports whether it matches anywhere in
// str. An invalid pattern panics.
func (m RegexpStringMatcher) MatchString(str string) bool {
	pattern := regexp.MustCompile(string(m))
	return pattern.MatchString(str)
}

// String returns the pattern text.
func (m RegexpStringMatcher) String() string {
	return string(m)
}
548
// ExactStringMatcher is a StringMatcher that requires the text to equal the
// expectation exactly.
type ExactStringMatcher string

// MatchString reports whether str is identical to the expected string.
func (m ExactStringMatcher) MatchString(str string) bool {
	return str == string(m)
}

// String returns the expected string.
func (m ExactStringMatcher) String() string {
	return string(m)
}
557
558 func assertRegexp(t *testing.T, input string, outputRE string) {
559 assertPlaintext(t, input, RegexpStringMatcher(outputRE))
560 }
561
562 func assertString(t *testing.T, input string, output string) {
563 assertPlaintext(t, input, ExactStringMatcher(output))
564 }
565
566 func assertPlaintext(t *testing.T, input string, matcher StringMatcher) {
567 text, err := FromString(input)
568 if err != nil {
569 t.Error(err)
570 }
571 if !matcher.MatchString(text) {
572 t.Errorf("Input did not match expression\n"+
573 "Input:\n>>>>\n%s\n<<<<\n\n"+
574 "Output:\n>>>>\n%s\n<<<<\n\n"+
575 "Expected output:\n>>>>\n%s\n<<<<\n\n",
576 input, text, matcher.String())
577 } else {
578 t.Logf("input:\n\n%s\n\n\n\noutput:\n\n%s\n", input, text)
579 }
580 }
581
582 func Example() {
583 inputHtml := `
584 <html>
585 <head>
586 <title>My Mega Service</title>
587 <link rel=\"stylesheet\" href=\"main.css\">
588 <style type=\"text/css\">body { color: #fff; }</style>
589 </head>
590
591 <body>
592 <div class="logo">
593 <a href="http://mymegaservice.com/"><img src="/logo-image.jpg" alt="Mega Service"/></a>
594 </div>
595
596 <h1>Welcome to your new account on my service!</h1>
597
598 <p>
599 Here is some more information:
600
601 <ul>
602 <li>Link 1: <a href="https://example.com">Example.com</a></li>
603 <li>Link 2: <a href="https://example2.com">Example2.com</a></li>
604 <li>Something else</li>
605 </ul>
606 </p>
607 </body>
608 </html>
609 `
610
611 text, err := FromString(inputHtml)
612 if err != nil {
613 panic(err)
614 }
615 fmt.Println(text)
616
617 // Output:
618 // Mega Service ( http://mymegaservice.com/ )
619 //
620 // ******************************************
621 // Welcome to your new account on my service!
622 // ******************************************
623 //
624 // Here is some more information:
625 //
626 // * Link 1: Example.com ( https://example.com )
627 // * Link 2: Example2.com ( https://example2.com )
628 // * Something else
629 }