diff --git a/server/channels/app/oembed/endpoint.go b/server/channels/app/oembed/endpoint.go new file mode 100644 index 0000000000..4f690953e2 --- /dev/null +++ b/server/channels/app/oembed/endpoint.go @@ -0,0 +1,42 @@ +// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved. +// See LICENSE.txt for license information. + +package oembed + +import ( + "net/url" + "regexp" +) + +//go:generate go run ./generator/providers_generator.go + +type ProviderEndpoint struct { + URL string + Patterns []*regexp.Regexp +} + +func (e *ProviderEndpoint) GetProviderURL(requestURL string) string { + // This error is checked when generating the list of providers + url, _ := url.Parse(e.URL) + + query := url.Query() + query.Add("format", "json") + query.Add("url", requestURL) + url.RawQuery = query.Encode() + + return url.String() +} + +// FindEndpointForURL returns a ProviderEndpoint for a given URL if it matches one that's supported by us. Returns nil +// if none of the supported providers match the given URL. +func FindEndpointForURL(requestURL string) *ProviderEndpoint { + for _, provider := range providers { + for _, pattern := range provider.Patterns { + if pattern.MatchString(requestURL) { + return provider + } + } + } + + return nil +} diff --git a/server/channels/app/oembed/endpoint_test.go b/server/channels/app/oembed/endpoint_test.go new file mode 100644 index 0000000000..4b6d54af42 --- /dev/null +++ b/server/channels/app/oembed/endpoint_test.go @@ -0,0 +1,63 @@ +// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved. +// See LICENSE.txt for license information. 
+ +package oembed + +import ( + "slices" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestFindEndpointForURL(t *testing.T) { + youtubeProvider := providers[slices.IndexFunc(providers, func(provider *ProviderEndpoint) bool { + return provider.URL == "https://www.youtube.com/oembed" + })] + + for _, testCase := range []struct { + Name string + Input string + Expected *ProviderEndpoint + }{ + { + Name: "random URL", + Input: "https://example.com/some/random.url", + Expected: nil, + }, + { + Name: "YouTube home page", + Input: "https://www.youtube.com", + Expected: nil, + }, + { + Name: "YouTube video", + Input: "https://www.youtube.com/watch?v=szfZfQFUSnU", + Expected: youtubeProvider, + }, + { + Name: "YouTube video with short link and tracking information", + Input: "https://youtu.be/Qq3zukqBFqQ?si=iK_TPT20H30mH90G", + Expected: youtubeProvider, + }, + { + Name: "YouTube video with playlist", + Input: "https://www.youtube.com/watch?v=Qq3zukqBFqQ&list=PL-jqvaPsjQpMqnRgFEw_3fuGQbcVDTpaM", + Expected: youtubeProvider, + }, + { + Name: "YouTube playlist", + Input: "https://www.youtube.com/playlist?list=PL-jqvaPsjQpMqnRgFEw_3fuGQbcVDTpaM", + Expected: youtubeProvider, + }, + { + Name: "YouTube channel", + Input: "https://www.youtube.com/@MattermostHQ", + Expected: nil, + }, + } { + t.Run(testCase.Name, func(t *testing.T) { + assert.Equal(t, testCase.Expected, FindEndpointForURL(testCase.Input)) + }) + } +} diff --git a/server/channels/app/oembed/generator/.gitignore b/server/channels/app/oembed/generator/.gitignore new file mode 100644 index 0000000000..c6603442c4 --- /dev/null +++ b/server/channels/app/oembed/generator/.gitignore @@ -0,0 +1 @@ +providers.json diff --git a/server/channels/app/oembed/generator/providers.go.tmpl b/server/channels/app/oembed/generator/providers.go.tmpl new file mode 100644 index 0000000000..48657e7a90 --- /dev/null +++ b/server/channels/app/oembed/generator/providers.go.tmpl @@ -0,0 +1,30 @@ +// Copyright (c) 2015-present 
Mattermost, Inc. All Rights Reserved. +// See LICENSE.txt for license information. + +// Code generated by "go generate ./channels/app/oembed" +// DO NOT EDIT + +//go:generate go run ./generator/providers_generator.go + +package oembed + +import ( + "regexp" +) + +var providers []*ProviderEndpoint + +func init() { + providers = []*ProviderEndpoint{ + {{- range .Endpoints }} + { + URL: "{{ .URL }}", + Patterns: []*regexp.Regexp{ + {{- range .Patterns }} + regexp.MustCompile(`{{ . }}`), + {{- end }} + }, + }, + {{- end }} + } +} diff --git a/server/channels/app/oembed/generator/providers_generator.go b/server/channels/app/oembed/generator/providers_generator.go new file mode 100644 index 0000000000..54a0b23827 --- /dev/null +++ b/server/channels/app/oembed/generator/providers_generator.go @@ -0,0 +1,166 @@ +// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved. +// See LICENSE.txt for license information. + +package main + +import ( + "encoding/json" + "fmt" + "net/url" + "os" + "regexp" + "slices" + "strings" + "text/template" + + "github.com/mattermost/mattermost/server/v8/channels/app/oembed" + "github.com/pkg/errors" +) + +// To update the list of oEmbed providers that we support: +// 1. Download the latest providers.json file from https://oembed.com/providers.json and place it in this folder +// 2. If desired, update supportedProviders below to add the names of additional oEmbed providers that we want to use +// 3. Run `go generate ./channels/app/oembed` from the server folder + +var ( + // supportedProviders contains the names of all of the oEmbed providers that we currently support. + // + // As of writing, we're only going to support YouTube because they've stopped giving us the required OpenGraph + // metadata. When we want to support oEmbed embeds for other providers, this will need to be updated. 
+ supportedProviders = []string{ + "YouTube", + } + + outputTemplate = template.Must(template.New("providers.go.tmpl").ParseFiles("./generator/providers.go.tmpl")) +) + +type oEmbedProvider struct { + ProviderName string `json:"provider_name"` + ProviderURL string `json:"provider_url"` + Endpoints []*oEmbedEndpoint `json:"endpoints"` +} + +type oEmbedEndpoint struct { + Schemes []string `json:"schemes,omitempty"` + URL string `json:"url"` + Discovery bool `json:"discovery,omitempty"` + Formats []string `json:"formats,omitempty"` +} + +func main() { + inputJson, err := os.ReadFile("./generator/providers.json") + if err != nil { + panic(errors.Wrap(err, "Unable to read providers.json. Did you forget to put it next to providers_generator.go?")) + } + + outputFile, err := os.Create("./providers_gen.go") + if err != nil { + panic(errors.Wrap(err, "Unable to open output file")) + } + defer outputFile.Close() + + var input []*oEmbedProvider + err = json.Unmarshal(inputJson, &input) + if err != nil { + panic(errors.Wrap(err, "Unable to read providers.json")) + } + + var endpoints []*oembed.ProviderEndpoint + for _, inputProvider := range input { + if !slices.Contains(supportedProviders, inputProvider.ProviderName) { + continue + } + + providerEndpoints, extractErr := extractEndpointsFromProvider(inputProvider) + if extractErr != nil { + panic(errors.Wrap(extractErr, "Unable to convert oEmbedProvider from providers.json to a ProviderEndpoint")) + } + endpoints = append(endpoints, providerEndpoints...) 
+ } + + err = outputTemplate.Execute(outputFile, map[string]any{ + "Endpoints": endpoints, + }) + if err != nil { + panic(errors.Wrap(err, "Unable to write file using template")) + } +} + +// extractEndpointsFromProvider turns the data for one provider into providers.json into multiple, more compact ProviderEndpoints +func extractEndpointsFromProvider(in *oEmbedProvider) ([]*oembed.ProviderEndpoint, error) { + var out []*oembed.ProviderEndpoint + + for _, endpoint := range in.Endpoints { + // Ensure that the endpoint URL is valid so that we don't need to error check it at runtime + _, err := url.Parse(endpoint.URL) + if err != nil { + return nil, err + } + + var patterns []*regexp.Regexp + for _, scheme := range endpoint.Schemes { + pattern, err := schemeToPattern(scheme) + if err != nil { + return nil, err + } + + patterns = append(patterns, pattern) + } + + if len(patterns) > 0 { + out = append(out, &oembed.ProviderEndpoint{ + URL: endpoint.URL, + Patterns: patterns, + }) + } + } + + return out, nil +} + +func schemeToPattern(scheme string) (*regexp.Regexp, error) { + partsPattern := regexp.MustCompile(`^(\w+:(?://)?)([^/]*)(/[^?]*)?(\?[^?]*)?$`) + parts := partsPattern.FindStringSubmatch(scheme) + if parts == nil { + return nil, fmt.Errorf("unable to split scheme %s into parts", scheme) + } else if len(parts) != 5 { + return nil, fmt.Errorf("wrong number of parts for scheme %s", scheme) + } + + protocol := parts[1] + if protocol != "http://" && protocol != "https://" && protocol != "spotify:" { + return nil, fmt.Errorf("unrecognized protocol %s for scheme %s", protocol, scheme) + } + domain := parts[2] + if domain == "" { + return nil, fmt.Errorf("no domain found for scheme %s", scheme) + } + path := parts[3] + if path == "" && protocol != "spotify:" { + return nil, fmt.Errorf("no path found for scheme %s", scheme) + } + query := parts[4] + + // Replace any valid wildcards with a temporary character so that we can escape any regexp special characters + domain = 
strings.Replace(domain, "*", "%", -1) + path = strings.Replace(path, "*", "%", -1) + query = strings.Replace(query, "*", "%", -1) + + // Escape any other special characters + protocol = regexp.QuoteMeta(protocol) + domain = regexp.QuoteMeta(domain) + path = regexp.QuoteMeta(path) + query = regexp.QuoteMeta(query) + + // Replace the temporary character with the proper regexp to match a wildcard in that part of the URL + domain = strings.Replace(domain, "%", "[^/]*?", -1) + path = strings.Replace(path, "%", ".*?", -1) + query = strings.Replace(query, "%", ".*?", -1) + + // Allow http schemes to match https URLs as well + if protocol == "http://" { + protocol = "https?://" + } + + return regexp.Compile("^" + protocol + domain + path + query + "$") +} diff --git a/server/channels/app/oembed/oembed.go b/server/channels/app/oembed/oembed.go new file mode 100644 index 0000000000..6febfdd7e1 --- /dev/null +++ b/server/channels/app/oembed/oembed.go @@ -0,0 +1,56 @@ +// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved. +// See LICENSE.txt for license information. 
// OEmbedResponse holds the fields of an oEmbed response that are read from its JSON representation.
type OEmbedResponse struct {
	// Type can be one of "photo", "video", "link", or "rich"
	Type string `json:"type"`

	// Fields that may be defined for any response type
	Version         string `json:"version"`
	Title           string `json:"title,omitempty"`
	AuthorName      string `json:"author_name,omitempty"`
	AuthorURL       string `json:"author_url,omitempty"`
	ProviderName    string `json:"provider_name,omitempty"`
	ProviderURL     string `json:"provider_url,omitempty"`
	CacheAge        string `json:"cache_age,omitempty"`
	ThumbnailURL    string `json:"thumbnail_url,omitempty"`
	ThumbnailWidth  int    `json:"thumbnail_width,omitempty"`
	ThumbnailHeight int    `json:"thumbnail_height,omitempty"`

	// Fields that are required for responses with the type "photo"
	URL string `json:"url"`

	// Fields that are required for responses of the type "video" or "rich"
	HTML string `json:"html"`

	// Fields that are required for responses with the type "photo", "video", or "rich"
	Width  int `json:"width"`
	Height int `json:"height"`
}

// UnmarshalJSON decodes an oEmbed response. It behaves like the default struct decoding except that cache_age is
// accepted both as a JSON string and as a bare JSON number. Without this, a numeric cache_age would make
// encoding/json reject the whole response because CacheAge is a Go string.
func (r *OEmbedResponse) UnmarshalJSON(data []byte) error {
	// plain has the same fields as OEmbedResponse but none of its methods, so decoding into it cannot recurse
	// back into this function.
	type plain OEmbedResponse

	aux := struct {
		*plain
		// CacheAge shadows plain.CacheAge (it sits at a shallower depth) so the raw value can be inspected.
		CacheAge json.RawMessage `json:"cache_age"`
	}{plain: (*plain)(r)}

	if err := json.Unmarshal(data, &aux); err != nil {
		return err
	}

	// An absent or null cache_age leaves the field untouched, which is what the default decoding does as well.
	if len(aux.CacheAge) == 0 || string(aux.CacheAge) == "null" {
		return nil
	}

	cacheAge, err := cacheAgeFromJSON(aux.CacheAge)
	if err != nil {
		return err
	}
	r.CacheAge = cacheAge

	return nil
}

// cacheAgeFromJSON converts a non-empty, non-null raw cache_age value into its string form. A JSON string is
// unquoted and a JSON number keeps its literal text; any other kind of value is an error.
func cacheAgeFromJSON(raw json.RawMessage) (string, error) {
	if raw[0] == '"' {
		var value string
		if err := json.Unmarshal(raw, &value); err != nil {
			return "", fmt.Errorf("invalid cache_age %s: %w", raw, err)
		}
		return value, nil
	}

	var value json.Number
	if err := json.Unmarshal(raw, &value); err != nil {
		return "", fmt.Errorf("invalid cache_age %s: %w", raw, err)
	}

	return value.String(), nil
}

// ResponseFromJSON decodes a single oEmbed response from r. It returns an error if the data cannot be decoded, if
// the version is not "1.0", or if the type is not one of "photo", "video", "link", or "rich". No other fields are
// validated.
func ResponseFromJSON(r io.Reader) (*OEmbedResponse, error) {
	var response OEmbedResponse

	if err := json.NewDecoder(r).Decode(&response); err != nil {
		return nil, err
	}

	// Do a quick smoke test to confirm that this is hopefully a valid oEmbed response
	if response.Version != "1.0" {
		return nil, fmt.Errorf("ResponseFromJson: Received unsupported response version %s", response.Version)
	}

	switch response.Type {
	case "photo", "video", "link", "rich":
		return &response, nil
	default:
		return nil, fmt.Errorf("ResponseFromJson: Received unsupported response type %s", response.Type)
	}
}
2015-present Mattermost, Inc. All Rights Reserved. +// See LICENSE.txt for license information. + +// Code generated by "go generate ./channels/app/oembed" +// DO NOT EDIT + +//go:generate go run ./generator/providers_generator.go + +package oembed + +import ( + "regexp" +) + +var providers []*ProviderEndpoint + +func init() { + providers = []*ProviderEndpoint{ + { + URL: "https://www.youtube.com/oembed", + Patterns: []*regexp.Regexp{ + regexp.MustCompile(`^https://[^/]*?\.youtube\.com/watch.*?$`), + regexp.MustCompile(`^https://[^/]*?\.youtube\.com/v/.*?$`), + regexp.MustCompile(`^https://youtu\.be/.*?$`), + regexp.MustCompile(`^https://[^/]*?\.youtube\.com/playlist\?list=.*?$`), + regexp.MustCompile(`^https://youtube\.com/playlist\?list=.*?$`), + regexp.MustCompile(`^https://[^/]*?\.youtube\.com/shorts.*?$`), + regexp.MustCompile(`^https://youtube\.com/shorts.*?$`), + regexp.MustCompile(`^https://[^/]*?\.youtube\.com/embed/.*?$`), + regexp.MustCompile(`^https://[^/]*?\.youtube\.com/live.*?$`), + regexp.MustCompile(`^https://youtube\.com/live.*?$`), + }, + }, + } +} diff --git a/server/channels/app/opengraph.go b/server/channels/app/opengraph.go index cfa5135326..769eda7109 100644 --- a/server/channels/app/opengraph.go +++ b/server/channels/app/opengraph.go @@ -10,9 +10,12 @@ import ( "time" "github.com/dyatlov/go-opengraph/opengraph" + ogImage "github.com/dyatlov/go-opengraph/opengraph/types/image" + "github.com/pkg/errors" "golang.org/x/net/html/charset" "github.com/mattermost/mattermost/server/public/shared/mlog" + "github.com/mattermost/mattermost/server/v8/channels/app/oembed" ) const ( @@ -144,3 +147,31 @@ func openGraphDecodeHTMLEntities(og *opengraph.OpenGraph) { og.Title = html.UnescapeString(og.Title) og.Description = html.UnescapeString(og.Description) } + +func (a *App) parseOpenGraphFromOEmbed(requestURL string, body io.Reader) (*opengraph.OpenGraph, error) { + oEmbedResponse, err := oembed.ResponseFromJSON(io.LimitReader(body, 
MaxOpenGraphResponseSize)) + if err != nil { + return nil, errors.Wrap(err, "parseOpenGraphFromOEmbed: Unable to parse oEmbed response") + } + + og := &opengraph.OpenGraph{ + Type: "opengraph", + Title: oEmbedResponse.Title, + URL: requestURL, + } + + if oEmbedResponse.ThumbnailURL != "" { + og.Images = append(og.Images, &ogImage.Image{ + Type: "image", + URL: oEmbedResponse.ThumbnailURL, + Width: uint64(oEmbedResponse.ThumbnailWidth), + Height: uint64(oEmbedResponse.ThumbnailHeight), + }) + } + + if toProxyURL := a.ImageProxyAdder(); toProxyURL != nil { + og = openGraphDataWithProxyAddedToImageURLs(og, toProxyURL) + } + + return og, nil +} diff --git a/server/channels/app/post_metadata.go b/server/channels/app/post_metadata.go index 11481b82a1..71640f6466 100644 --- a/server/channels/app/post_metadata.go +++ b/server/channels/app/post_metadata.go @@ -17,12 +17,14 @@ import ( "time" "github.com/dyatlov/go-opengraph/opengraph" + "github.com/pkg/errors" "golang.org/x/net/idna" "github.com/mattermost/mattermost/server/public/model" "github.com/mattermost/mattermost/server/public/shared/markdown" "github.com/mattermost/mattermost/server/public/shared/mlog" "github.com/mattermost/mattermost/server/public/shared/request" + "github.com/mattermost/mattermost/server/v8/channels/app/oembed" "github.com/mattermost/mattermost/server/v8/channels/app/platform" "github.com/mattermost/mattermost/server/v8/channels/utils/imgutils" ) @@ -646,84 +648,17 @@ func (a *App) getLinkMetadata(c request.CTX, requestURL string, timestamp int64, var err error if looksLikeAPermalink(requestURL, a.GetSiteURL()) && *a.Config().ServiceSettings.EnablePermalinkPreviews { - referencedPostID := requestURL[len(requestURL)-26:] + permalink, err = a.getLinkMetadataForPermalink(c, requestURL) - referencedPost, appErr := a.GetSinglePost(c, referencedPostID, false) - // TODO: Look into saving a value in the LinkMetadata.Data field to prevent perpetually re-querying for the deleted post. 
- if appErr != nil { - return nil, nil, nil, appErr - } - - referencedChannel, appErr := a.GetChannel(c, referencedPost.ChannelId) - if appErr != nil { - return nil, nil, nil, appErr - } - - var referencedTeam *model.Team - if referencedChannel.Type == model.ChannelTypeDirect || referencedChannel.Type == model.ChannelTypeGroup { - referencedTeam = &model.Team{} - } else { - referencedTeam, appErr = a.GetTeam(referencedChannel.TeamId) - if appErr != nil { - return nil, nil, nil, appErr - } - } - - // Get metadata for embedded post - if a.containsPermalink(c, referencedPost) { - // referencedPost contains a permalink: we don't get its metadata - permalink = &model.Permalink{PreviewPost: model.NewPreviewPost(referencedPost, referencedTeam, referencedChannel)} - } else { - // referencedPost does not contain a permalink: we get its metadata - referencedPostWithMetadata := a.PreparePostForClientWithEmbedsAndImages(c, referencedPost, false, false, false) - permalink = &model.Permalink{PreviewPost: model.NewPreviewPost(referencedPostWithMetadata, referencedTeam, referencedChannel)} - } - } else { - var request *http.Request - // Make request for a web page or an image - request, err = http.NewRequest("GET", requestURL, nil) if err != nil { return nil, nil, nil, err } + } else if oEmbedProvider := oembed.FindEndpointForURL(requestURL); oEmbedProvider != nil { + og, err = a.getLinkMetadataFromOEmbed(c, requestURL, oEmbedProvider) + } else { + og, image, err = a.getLinkMetadataForURL(c, requestURL) - var body io.ReadCloser - var contentType string - - if (request.URL.Scheme+"://"+request.URL.Host) == a.GetSiteURL() && request.URL.Path == "/api/v4/image" { - // /api/v4/image requires authentication, so bypass the API by hitting the proxy directly - body, contentType, err = a.ImageProxy().GetImageDirect(a.ImageProxy().GetUnproxiedImageURL(request.URL.String())) - } else { - request.Header.Add("Accept", "image/*") - request.Header.Add("Accept", "text/html;q=0.8") - 
request.Header.Add("Accept-Language", *a.Config().LocalizationSettings.DefaultServerLocale) - - client := a.HTTPService().MakeClient(false) - client.Timeout = time.Duration(*a.Config().ExperimentalSettings.LinkMetadataTimeoutMilliseconds) * time.Millisecond - - var res *http.Response - res, err = client.Do(request) - if err != nil { - c.Logger().Warn("error fetching OG image data", mlog.Err(err)) - } - - if res != nil { - body = res.Body - contentType = res.Header.Get("Content-Type") - } - } - - if body != nil { - defer func() { - io.Copy(io.Discard, body) - body.Close() - }() - } - - if err == nil { - // Parse the data - og, image, err = a.parseLinkMetadata(requestURL, body, contentType) - } - og = model.TruncateOpenGraph(og) // remove unwanted length of texts + // We intentionally don't return early on an error because we want to save that there is no metadata for this link a.saveLinkMetadataToDatabase(requestURL, timestamp, og, image) } @@ -734,6 +669,123 @@ func (a *App) getLinkMetadata(c request.CTX, requestURL string, timestamp int64, return og, image, permalink, err } +func (a *App) getLinkMetadataForPermalink(c request.CTX, requestURL string) (*model.Permalink, error) { + referencedPostID := requestURL[len(requestURL)-26:] + + referencedPost, appErr := a.GetSinglePost(c, referencedPostID, false) + // TODO: Look into saving a value in the LinkMetadata.Data field to prevent perpetually re-querying for the deleted post. 
+ if appErr != nil { + return nil, appErr + } + + referencedChannel, appErr := a.GetChannel(c, referencedPost.ChannelId) + if appErr != nil { + return nil, appErr + } + + var referencedTeam *model.Team + if referencedChannel.Type == model.ChannelTypeDirect || referencedChannel.Type == model.ChannelTypeGroup { + referencedTeam = &model.Team{} + } else { + referencedTeam, appErr = a.GetTeam(referencedChannel.TeamId) + if appErr != nil { + return nil, appErr + } + } + + // Get metadata for embedded post + var permalink *model.Permalink + if a.containsPermalink(c, referencedPost) { + // referencedPost contains a permalink: we don't get its metadata + permalink = &model.Permalink{PreviewPost: model.NewPreviewPost(referencedPost, referencedTeam, referencedChannel)} + } else { + // referencedPost does not contain a permalink: we get its metadata + referencedPostWithMetadata := a.PreparePostForClientWithEmbedsAndImages(c, referencedPost, false, false, false) + permalink = &model.Permalink{PreviewPost: model.NewPreviewPost(referencedPostWithMetadata, referencedTeam, referencedChannel)} + } + + return permalink, nil +} + +func (a *App) getLinkMetadataFromOEmbed(c request.CTX, requestURL string, provider *oembed.ProviderEndpoint) (*opengraph.OpenGraph, error) { + request, err := http.NewRequest("GET", provider.GetProviderURL(requestURL), nil) + if err != nil { + return nil, err + } + + request.Header.Add("Accept", "application/json") + request.Header.Add("Accept-Language", *a.Config().LocalizationSettings.DefaultServerLocale) + + client := a.HTTPService().MakeClient(false) + client.Timeout = time.Duration(*a.Config().ExperimentalSettings.LinkMetadataTimeoutMilliseconds) * time.Millisecond + + res, err := client.Do(request) + if err != nil { + c.Logger().Warn("error fetching oEmbed data", mlog.Err(err)) + return nil, errors.Wrap(err, "getLinkMetadataFromOEmbed: Unable to get oEmbed data") + } + + defer func() { + io.Copy(io.Discard, res.Body) + res.Body.Close() + }() + + 
return a.parseOpenGraphFromOEmbed(requestURL, res.Body) +} + +func (a *App) getLinkMetadataForURL(c request.CTX, requestURL string) (*opengraph.OpenGraph, *model.PostImage, error) { + var request *http.Request + // Make request for a web page or an image + request, err := http.NewRequest("GET", requestURL, nil) + if err != nil { + return nil, nil, err + } + + var body io.ReadCloser + var contentType string + + if (request.URL.Scheme+"://"+request.URL.Host) == a.GetSiteURL() && request.URL.Path == "/api/v4/image" { + // /api/v4/image requires authentication, so bypass the API by hitting the proxy directly + body, contentType, err = a.ImageProxy().GetImageDirect(a.ImageProxy().GetUnproxiedImageURL(request.URL.String())) + } else { + request.Header.Add("Accept", "image/*") + request.Header.Add("Accept", "text/html;q=0.8") + request.Header.Add("Accept-Language", *a.Config().LocalizationSettings.DefaultServerLocale) + + client := a.HTTPService().MakeClient(false) + client.Timeout = time.Duration(*a.Config().ExperimentalSettings.LinkMetadataTimeoutMilliseconds) * time.Millisecond + + var res *http.Response + res, err = client.Do(request) + if err != nil { + c.Logger().Warn("error fetching OG image data", mlog.Err(err)) + } + + if res != nil { + body = res.Body + contentType = res.Header.Get("Content-Type") + } + } + + if body != nil { + defer func() { + io.Copy(io.Discard, body) + body.Close() + }() + } + + var og *opengraph.OpenGraph + var image *model.PostImage + + if err == nil { + // Parse the data + og, image, err = a.parseLinkMetadata(requestURL, body, contentType) + } + og = model.TruncateOpenGraph(og) // remove unwanted length of texts + + return og, image, err +} + // resolveMetadataURL resolves a given URL relative to the server's site URL. func resolveMetadataURL(requestURL string, siteURL string) string { base, err := url.Parse(siteURL)