Codebase list golang-github-jaytaylor-html2text / 990da46
Import upstream version 0.0~git20190408.0.01ec452, md5 eaf728835f9afeb014fa23f4db014449 Debian Janitor 4 years ago
4 changed file(s) with 172 addition(s) and 16 deletion(s). Raw diff Collapse all Expand all
0 # Compiled Object files, Static and Dynamic libs (Shared Objects)
1 *.o
2 *.a
3 *.so
4
5 # Folders
6 _obj
7 _test
8
9 # Architecture specific extensions/prefixes
10 *.[568vq]
11 [568vq].out
12
13 *.cgo1.go
14 *.cgo2.c
15 _cgo_defun.c
16 _cgo_gotypes.go
17 _cgo_export.*
18
19 _testmain.go
20
21 *.exe
22 *.test
23 *.prof
33 [![Build Status](https://travis-ci.org/jaytaylor/html2text.svg?branch=master)](https://travis-ci.org/jaytaylor/html2text)
44 [![Report Card](https://goreportcard.com/badge/github.com/jaytaylor/html2text)](https://goreportcard.com/report/github.com/jaytaylor/html2text)
55
6 ### Converts HTML into text
6 ### Converts HTML into text of the markdown-flavored variety
77
88
99 ## Introduction
1010
1111 Ensure your emails are readable by all!
1212
13 Turns HTML into raw text, useful for sending fancy HTML emails with a equivalently nicely formatted TXT document as a fallback (e.g. for people who don't allow HTML emails or have other display issues).
13 Turns HTML into raw text, useful for sending fancy HTML emails with an equivalently nicely formatted TXT document as a fallback (e.g. for people who don't allow HTML emails or have other display issues).
1414
1515 html2text is a simple golang package for rendering HTML into plaintext.
1616
2222 ## Download the package
2323
2424 ```bash
25 go get github.com/jaytaylor/html2text
25 go get jaytaylor.com/html2text
2626 ```
2727
2828 ## Example usage
3333 import (
3434 "fmt"
3535
36 "github.com/jaytaylor/html2text"
36 "jaytaylor.com/html2text"
3737 )
3838
3939 func main() {
7777 </body>
7878 </html>`
7979
80 text, err := FromString(inputHTML, Options{PrettyTables: true})
80 text, err := html2text.FromString(inputHTML, html2text.Options{PrettyTables: true})
8181 if err != nil {
8282 panic(err)
8383 }
1414
1515 // Options provide toggles and overrides to control specific rendering behaviors.
1616 type Options struct {
17 PrettyTables bool // Turns on pretty ASCII rendering for table elements.
17 PrettyTables bool // Turns on pretty ASCII rendering for table elements.
18 PrettyTablesOptions *PrettyTablesOptions // Configures pretty ASCII rendering for table elements; when nil, no tablewriter overrides are applied.
19 OmitLinks bool // Turns on omitting links: the "( href )" suffix is not appended after link text.
20 }
21
22 // PrettyTablesOptions overrides tablewriter behaviors.
// When Options.PrettyTablesOptions is non-nil, every field is handed to the
// tablewriter setter named beside it before the table is rendered.
23 type PrettyTablesOptions struct {
24 AutoFormatHeader bool // Passed to SetAutoFormatHeaders.
25 AutoWrapText bool // Passed to SetAutoWrapText.
26 ReflowDuringAutoWrap bool // Passed to SetReflowDuringAutoWrap.
27 ColWidth int // Passed to SetColWidth.
28 ColumnSeparator string // Passed to SetColumnSeparator.
29 RowSeparator string // Passed to SetRowSeparator.
30 CenterSeparator string // Passed to SetCenterSeparator.
31 HeaderAlignment int // Passed to SetHeaderAlignment.
32 FooterAlignment int // Passed to SetFooterAlignment.
33 Alignment int // Passed to SetAlignment.
34 ColumnAlignment []int // Passed to SetColumnAlignment.
35 NewLine string // Passed to SetNewLine.
36 HeaderLine bool // Passed to SetHeaderLine.
37 RowLine bool // Passed to SetRowLine.
38 AutoMergeCells bool // Passed to SetAutoMergeCells.
39 Borders tablewriter.Border // Passed to SetBorders.
40 }
41
42 // NewPrettyTablesOptions creates PrettyTablesOptions with default settings.
// In the returned value, header auto-formatting, text auto-wrapping, reflow
// during auto-wrap and the header line are enabled; row lines and cell merging
// are disabled; all four borders are drawn. Column width, separators,
// alignments and the newline string are taken from the tablewriter package
// constants, and ColumnAlignment is an empty (non-nil) slice.
43 func NewPrettyTablesOptions() *PrettyTablesOptions {
44 return &PrettyTablesOptions{
45 AutoFormatHeader: true,
46 AutoWrapText: true,
47 ReflowDuringAutoWrap: true,
48 ColWidth: tablewriter.MAX_ROW_WIDTH,
49 ColumnSeparator: tablewriter.COLUMN,
50 RowSeparator: tablewriter.ROW,
51 CenterSeparator: tablewriter.CENTER,
52 HeaderAlignment: tablewriter.ALIGN_DEFAULT,
53 FooterAlignment: tablewriter.ALIGN_DEFAULT,
54 Alignment: tablewriter.ALIGN_DEFAULT,
55 ColumnAlignment: []int{},
56 NewLine: tablewriter.NEWLINE,
57 HeaderLine: true,
58 RowLine: false,
59 AutoMergeCells: false,
60 Borders: tablewriter.Border{Left: true, Right: true, Bottom: true, Top: true},
61 }
1862 }
1963
2064 // FromHTMLNode renders text output from a pre-parsed HTML document.
78122 justClosedDiv bool
79123 blockquoteLevel int
80124 lineLength int
125 isPre bool
81126 }
82127
83128 // tableTraverseContext holds table ASCII-form related context.
208253 if attrVal := getAttrVal(node, "href"); attrVal != "" {
209254 attrVal = ctx.normalizeHrefLink(attrVal)
210255 // Don't print link href if it matches link element content or if the link is empty.
211 if attrVal != "" && linkText != attrVal {
256 if !ctx.options.OmitLinks && attrVal != "" && linkText != attrVal {
212257 hrefLink = "( " + attrVal + " )"
213258 }
214259 }
226271 }
227272 return ctx.traverseChildren(node)
228273
274 case atom.Pre:
275 ctx.isPre = true
276 err := ctx.traverseChildren(node)
277 ctx.isPre = false
278 return err
279
229280 case atom.Style, atom.Script, atom.Head:
230281 // Ignore the subtree.
231282 return nil
268319
269320 buf := &bytes.Buffer{}
270321 table := tablewriter.NewWriter(buf)
322 if ctx.options.PrettyTablesOptions != nil {
323 options := ctx.options.PrettyTablesOptions
324 table.SetAutoFormatHeaders(options.AutoFormatHeader)
325 table.SetAutoWrapText(options.AutoWrapText)
326 table.SetReflowDuringAutoWrap(options.ReflowDuringAutoWrap)
327 table.SetColWidth(options.ColWidth)
328 table.SetColumnSeparator(options.ColumnSeparator)
329 table.SetRowSeparator(options.RowSeparator)
330 table.SetCenterSeparator(options.CenterSeparator)
331 table.SetHeaderAlignment(options.HeaderAlignment)
332 table.SetFooterAlignment(options.FooterAlignment)
333 table.SetAlignment(options.Alignment)
334 table.SetColumnAlignment(options.ColumnAlignment)
335 table.SetNewLine(options.NewLine)
336 table.SetHeaderLine(options.HeaderLine)
337 table.SetRowLine(options.RowLine)
338 table.SetAutoMergeCells(options.AutoMergeCells)
339 table.SetBorders(options.Borders)
340 }
271341 table.SetHeader(ctx.tableCtx.header)
272342 table.SetFooter(ctx.tableCtx.footer)
273343 table.AppendBulk(ctx.tableCtx.body)
324394 return ctx.traverseChildren(node)
325395
326396 case html.TextNode:
327 data := strings.Trim(spacingRe.ReplaceAllString(node.Data, " "), " ")
397 var data string
398 if ctx.isPre {
399 data = node.Data
400 } else {
401 data = strings.TrimSpace(spacingRe.ReplaceAllString(node.Data, " "))
402 }
328403 return ctx.emit(data)
329404
330405 case html.ElementNode:
138138 "Test text<br><BR />Test text",
139139 "Test text\n\nTest text",
140140 },
141 {
142 "<pre>test1\ntest 2\n\ntest 3</pre>",
143 "test1\ntest 2\n\ntest 3",
144 },
141145 }
142146
143147 for _, testCase := range testCases {
184188 {
185189 `<table>
186190 <tbody>
187 <tr><td><p>Row-1-Col-1-Msg1</p><p>Row-1-Col-1-Msg2</p></td><td>Row-1-Col-2</td></tr>
191 <tr><td><p>Row-1-Col-1-Msg123456789012345</p><p>Row-1-Col-1-Msg2</p></td><td>Row-1-Col-2</td></tr>
188192 <tr><td>Row-2-Col-1</td><td>Row-2-Col-2</td></tr>
189193 </tbody>
190194 </table>`,
191195 // +--------------------------------+-------------+
192 // | Row-1-Col-1-Msg1 | Row-1-Col-2 |
196 // | Row-1-Col-1-Msg123456789012345 | Row-1-Col-2 |
193197 // | Row-1-Col-1-Msg2 | |
194198 // | Row-2-Col-1 | Row-2-Col-2 |
195199 // +--------------------------------+-------------+
196200 `+--------------------------------+-------------+
197 | Row-1-Col-1-Msg1 | Row-1-Col-2 |
201 | Row-1-Col-1-Msg123456789012345 | Row-1-Col-2 |
198202 | Row-1-Col-1-Msg2 | |
199203 | Row-2-Col-1 | Row-2-Col-2 |
200204 +--------------------------------+-------------+`,
201 `Row-1-Col-1-Msg1
205 `Row-1-Col-1-Msg123456789012345
202206
203207 Row-1-Col-1-Msg2
204208
240244 "Header 1 Header 2 Footer 1 Footer 2 Row 1 Col 1 Row 1 Col 2 Row 2 Col 1 Row 2 Col 2",
241245 },
242246 // Two tables in same HTML (goal is to test that context is
243 // reinitalized correctly).
247 // reinitialized correctly).
244248 {
245249 `<p>
246250 <table>
329333
330334 for _, testCase := range testCases {
331335 options := Options{
332 PrettyTables: true,
336 PrettyTables: true,
337 PrettyTablesOptions: NewPrettyTablesOptions(),
333338 }
334339 // Check pretty tabular ASCII version.
335340 if msg, err := wantString(testCase.input, testCase.tabularOutput, options); err != nil {
467472 }
468473 }
469474
470 func TestImageAltTags(t *testing.T) {
// TestOmitLinks checks the rendered text for anchor elements when
// Options.OmitLinks is set: each expected output contains only the link text
// (or image alt text) and never the href.
475 func TestOmitLinks(t *testing.T) {
471476 testCases := []struct {
472477 input string
473478 output string
474479 }{
475480 {
481 `<a></a>`,
482 ``,
483 },
484 {
485 `<a href=""></a>`,
486 ``,
487 },
488 {
489 `<a href="http://example.com/"></a>`,
490 ``,
491 },
492 {
493 `<a href="">Link</a>`,
494 `Link`,
495 },
496 {
497 `<a href="http://example.com/">Link</a>`,
498 `Link`,
499 },
500 {
501 `<a href="http://example.com/"><span class="a">Link</span></a>`,
502 `Link`,
503 },
504 {
505 "<a href='http://example.com/'>\n\t<span class='a'>Link</span>\n\t</a>",
506 `Link`,
507 },
508 {
509 `<a href="http://example.com/"><img src="http://example.ru/hello.jpg" alt="Example"></a>`,
510 `Example`,
511 },
512 }
513
// Every case is rendered with OmitLinks enabled and compared via wantString.
514 for _, testCase := range testCases {
515 if msg, err := wantString(testCase.input, testCase.output, Options{OmitLinks: true}); err != nil {
// Report the failure but keep going so the remaining cases still run.
516 t.Error(err)
517 } else if len(msg) > 0 {
518 t.Log(msg)
519 }
520 }
521 }
522
523 func TestImageAltTags(t *testing.T) {
524 testCases := []struct {
525 input string
526 output string
527 }{
528 {
476529 `<img />`,
477530 ``,
478531 },
614667 {
615668 "Test 1<div>Test 2</div> <div>Test 3</div>Test 4",
616669 "Test 1\nTest 2\nTest 3\nTest 4",
670 },
671 {
672 "Test 1<div>&nbsp;Test 2&nbsp;</div>",
673 "Test 1\nTest 2",
617674 },
618675 }
619676
743800 `hi
744801
745802 <br>
746
803
747804 hello <a href="https://google.com">google</a>
748805 <br><br>
749806 test<p>List:</p>