Files
mattermost/vendor/github.com/advancedlogic/GoOse/goose.go
Jesús Espino 8d5be2d657 Document extractor service (#15665)
* Document extractor service

* Fixing vendor modules

* Addressing PR Review comments

* Some small simplifications

* Fixing a linter complaint

* simplifying a bit the code using package variables

Co-authored-by: Mattermod <mattermod@users.noreply.github.com>
2020-10-27 15:58:38 +01:00

35 lines
894 B
Go

package goose
import (
"github.com/pkg/errors"
)
// Goose is the main entry point of the package: an article extractor whose
// methods turn a URL or a raw HTML string into an Article.
type Goose struct {
// config is handed to every HTML requester and crawler that the
// extraction methods create.
config Configuration
}
// New builds an article extractor. The optional args are forwarded verbatim
// to GetDefaultConfiguration, whose result becomes the extractor's
// configuration.
func New(args ...string) Goose {
	var g Goose
	g.config = GetDefaultConfiguration(args...)
	return g
}
// ExtractFromURL fetches the HTML page at url and crawls it into an Article.
//
// The page is downloaded with an HTML requester built from g's configuration.
// If the fetch fails, the returned article is nil and the error wraps the
// underlying failure. Otherwise the fetched HTML and the url are passed to a
// crawler built from the same configuration, and its result is returned
// unchanged.
func (g Goose) ExtractFromURL(url string) (*Article, error) {
	requester := NewHtmlRequester(g.config)
	html, err := requester.fetchHTML(url)
	if err != nil {
		return nil, errors.Wrap(err, "could not get html from site")
	}

	cc := NewCrawler(g.config)
	return cc.Crawl(html, url)
}
// ExtractFromRawHTML crawls already-downloaded HTML into an Article.
//
// RawHTML and url are passed, as given, to a crawler built from g's
// configuration; the crawler's article and error are returned as-is.
func (g Goose) ExtractFromRawHTML(RawHTML string, url string) (*Article, error) {
	crawler := NewCrawler(g.config)
	article, err := crawler.Crawl(RawHTML, url)
	return article, err
}