MM-60351 Use oEmbed for YouTube links (#28312)

* Split up handling of permalinks and other links in getLinkMetadata

* MM-60351 Use oEmbed for YouTube links

* Explicitly request json from the oEmbed provider

* Fix linter

* Fix type of CacheAge field

* Address feedback
This commit is contained in:
Harrison Healey 2024-10-01 14:06:45 -04:00 committed by GitHub
parent d53a2ef4df
commit 76021c76a0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 549 additions and 73 deletions

View File

@ -0,0 +1,42 @@
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
package oembed
import (
"net/url"
"regexp"
)
//go:generate go run ./generator/providers_generator.go
// ProviderEndpoint describes a single oEmbed API endpoint together with the URL patterns that it serves embeds for.
type ProviderEndpoint struct {
	// URL is the address of the provider's oEmbed API endpoint.
	URL string
	// Patterns are the regular expressions a link is matched against to decide whether this endpoint handles it.
	Patterns []*regexp.Regexp
}

// GetProviderURL returns the URL to request from this endpoint in order to get the oEmbed data for requestURL.
// The result is the endpoint URL with the "format" query parameter set to "json" and the "url" query parameter set
// to requestURL; any value the endpoint URL already carried for either parameter is replaced rather than duplicated.
//
// The endpoint URL is validated when the list of providers is generated, so parsing is not expected to fail here.
// If it does fail anyway, an empty string is returned instead of dereferencing a nil URL.
func (e *ProviderEndpoint) GetProviderURL(requestURL string) string {
	endpointURL, err := url.Parse(e.URL)
	if err != nil {
		return ""
	}

	query := endpointURL.Query()
	query.Set("format", "json")
	query.Set("url", requestURL)
	endpointURL.RawQuery = query.Encode()

	return endpointURL.String()
}
// FindEndpointForURL looks through the supported oEmbed providers, in order, and returns the first ProviderEndpoint
// that has a pattern matching requestURL. It returns nil when no supported provider matches.
func FindEndpointForURL(requestURL string) *ProviderEndpoint {
	for i := range providers {
		candidate := providers[i]
		for j := range candidate.Patterns {
			if candidate.Patterns[j].MatchString(requestURL) {
				return candidate
			}
		}
	}

	return nil
}

View File

@ -0,0 +1,63 @@
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
package oembed
import (
"slices"
"testing"
"github.com/stretchr/testify/assert"
)
// TestFindEndpointForURL checks which kinds of links are matched to the YouTube oEmbed endpoint and which are not
// matched to any endpoint at all.
func TestFindEndpointForURL(t *testing.T) {
	youtubeIndex := slices.IndexFunc(providers, func(provider *ProviderEndpoint) bool {
		return provider.URL == "https://www.youtube.com/oembed"
	})
	// IndexFunc returns -1 when nothing matches; fail with a clear message instead of an index-out-of-range panic.
	if youtubeIndex < 0 {
		t.Fatal("YouTube provider not found in the list of providers")
	}
	youtubeProvider := providers[youtubeIndex]

	for _, testCase := range []struct {
		Name     string
		Input    string
		Expected *ProviderEndpoint
	}{
		{
			Name:     "random URL",
			Input:    "https://example.com/some/random.url",
			Expected: nil,
		},
		{
			Name:     "YouTube home page",
			Input:    "https://www.youtube.com",
			Expected: nil,
		},
		{
			Name:     "YouTube video",
			Input:    "https://www.youtube.com/watch?v=szfZfQFUSnU",
			Expected: youtubeProvider,
		},
		{
			Name:     "YouTube video with short link and tracking information",
			Input:    "https://youtu.be/Qq3zukqBFqQ?si=iK_TPT20H30mH90G",
			Expected: youtubeProvider,
		},
		{
			Name:     "YouTube video with playlist",
			Input:    "https://www.youtube.com/watch?v=Qq3zukqBFqQ&list=PL-jqvaPsjQpMqnRgFEw_3fuGQbcVDTpaM",
			Expected: youtubeProvider,
		},
		{
			Name:     "YouTube playlist",
			Input:    "https://www.youtube.com/playlist?list=PL-jqvaPsjQpMqnRgFEw_3fuGQbcVDTpaM",
			Expected: youtubeProvider,
		},
		{
			Name:     "YouTube channel",
			Input:    "https://www.youtube.com/@MattermostHQ",
			Expected: nil,
		},
	} {
		t.Run(testCase.Name, func(t *testing.T) {
			assert.Equal(t, testCase.Expected, FindEndpointForURL(testCase.Input))
		})
	}
}

View File

@ -0,0 +1 @@
providers.json

View File

@ -0,0 +1,30 @@
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
// Code generated by "go generate ./channels/app/oembed"; DO NOT EDIT.
//go:generate go run ./generator/providers_generator.go
package oembed
import (
"regexp"
)
// providers is the list of oEmbed endpoints known to this package. It is filled in by init below, which the
// template expands to one entry per element of .Endpoints.
var providers []*ProviderEndpoint
// init builds providers. Each entry carries the endpoint's URL and one compiled regular expression per element of
// that endpoint's .Patterns; regexp.MustCompile panics at start-up if a rendered pattern is not a valid expression.
func init() {
providers = []*ProviderEndpoint{
{{- range .Endpoints }}
{
URL: "{{ .URL }}",
Patterns: []*regexp.Regexp{
{{- range .Patterns }}
regexp.MustCompile(`{{ . }}`),
{{- end }}
},
},
{{- end }}
}
}

View File

@ -0,0 +1,166 @@
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
package main
import (
"encoding/json"
"fmt"
"net/url"
"os"
"regexp"
"slices"
"strings"
"text/template"
"github.com/mattermost/mattermost/server/v8/channels/app/oembed"
"github.com/pkg/errors"
)
// To update the list of oEmbed providers that we support:
// 1. Download the latest providers.json file from https://oembed.com/providers.json and place it in this folder
// 2. If desired, update supportedProviders below to add the names of additional oEmbed providers that we want to use
// 3. Run `go generate ./channels/app/oembed` from the server folder
// supportedProviders lists the names of every oEmbed provider that we currently generate endpoints for.
//
// At the time of writing, only YouTube is supported because it has stopped giving us the OpenGraph metadata that we
// need. Add more names here when oEmbed embeds should be used for other providers as well.
var supportedProviders = []string{"YouTube"}

// outputTemplate is the parsed template that the generated Go source is rendered from. The template file is parsed
// when the program starts, and template.Must panics if that fails.
var outputTemplate = template.Must(
	template.New("providers.go.tmpl").ParseFiles("./generator/providers.go.tmpl"),
)
type (
	// oEmbedProvider is the shape of one element of the array stored in providers.json.
	oEmbedProvider struct {
		ProviderName string            `json:"provider_name"`
		ProviderURL  string            `json:"provider_url"`
		Endpoints    []*oEmbedEndpoint `json:"endpoints"`
	}

	// oEmbedEndpoint is the shape of one element of a provider's "endpoints" array in providers.json.
	oEmbedEndpoint struct {
		// Schemes are the URL schemes, possibly containing * wildcards, that this endpoint handles.
		Schemes []string `json:"schemes,omitempty"`
		// URL is the address of the oEmbed API endpoint.
		URL       string   `json:"url"`
		Discovery bool     `json:"discovery,omitempty"`
		Formats   []string `json:"formats,omitempty"`
	}
)
// main generates ./providers_gen.go from ./generator/providers.json. It reads the list of oEmbed providers, keeps
// the ones named in supportedProviders, converts their endpoints into ProviderEndpoints and renders those with
// outputTemplate. Any failure aborts the program with a panic.
func main() {
	inputJSON, err := os.ReadFile("./generator/providers.json")
	if err != nil {
		panic(errors.Wrap(err, "Unable to read providers.json. Did you forget to put it next to providers_generator.go?"))
	}

	var input []*oEmbedProvider
	err = json.Unmarshal(inputJSON, &input)
	if err != nil {
		panic(errors.Wrap(err, "Unable to read providers.json"))
	}

	var endpoints []*oembed.ProviderEndpoint
	for _, inputProvider := range input {
		if !slices.Contains(supportedProviders, inputProvider.ProviderName) {
			continue
		}

		providerEndpoints, extractErr := extractEndpointsFromProvider(inputProvider)
		if extractErr != nil {
			panic(errors.Wrap(extractErr, "Unable to convert oEmbedProvider from providers.json to a ProviderEndpoint"))
		}

		endpoints = append(endpoints, providerEndpoints...)
	}

	// Render into memory first and only touch the output file once everything has succeeded. This program imports
	// the oembed package, so leaving a truncated or half-written providers_gen.go behind after a failure would
	// stop the generator itself from compiling on the next run.
	var rendered strings.Builder
	err = outputTemplate.Execute(&rendered, map[string]any{
		"Endpoints": endpoints,
	})
	if err != nil {
		panic(errors.Wrap(err, "Unable to write file using template"))
	}

	// 0o666 (before umask) is the same mode that os.Create uses for new files.
	err = os.WriteFile("./providers_gen.go", []byte(rendered.String()), 0o666)
	if err != nil {
		panic(errors.Wrap(err, "Unable to write output file"))
	}
}
// extractEndpointsFromProvider turns the data for one provider in providers.json into multiple, more compact
// ProviderEndpoints, one for each of the provider's endpoints that declares at least one scheme. It returns an error
// if an endpoint URL cannot be parsed or if one of its schemes cannot be converted into a pattern.
func extractEndpointsFromProvider(in *oEmbedProvider) ([]*oembed.ProviderEndpoint, error) {
	var out []*oembed.ProviderEndpoint

	for _, endpoint := range in.Endpoints {
		// Validate the endpoint URL now so that it doesn't have to be error checked at runtime
		if _, err := url.Parse(endpoint.URL); err != nil {
			return nil, err
		}

		patterns := make([]*regexp.Regexp, 0, len(endpoint.Schemes))
		for i := range endpoint.Schemes {
			compiled, err := schemeToPattern(endpoint.Schemes[i])
			if err != nil {
				return nil, err
			}

			patterns = append(patterns, compiled)
		}

		// An endpoint without any schemes can never match a link, so it's left out
		if len(patterns) == 0 {
			continue
		}

		out = append(out, &oembed.ProviderEndpoint{
			URL:      endpoint.URL,
			Patterns: patterns,
		})
	}

	return out, nil
}
// schemePartsPattern splits an oEmbed scheme into its protocol, domain, optional path and optional query string.
var schemePartsPattern = regexp.MustCompile(`^(\w+:(?://)?)([^/]*)(/[^?]*)?(\?[^?]*)?$`)

// schemeToPattern converts a URL scheme from providers.json, which may contain * wildcards, into an anchored regular
// expression that matches the same URLs.
//
// A wildcard in the domain matches any run of characters other than "/", and a wildcard in the path or query string
// matches any run of characters. Every other character is matched literally. A scheme that uses http:// produces a
// pattern that matches both http:// and https:// URLs.
//
// An error is returned if the scheme cannot be split into parts, if its protocol is not http://, https:// or
// spotify:, if it has no domain, or if it has no path (a path is not required for spotify: schemes).
func schemeToPattern(scheme string) (*regexp.Regexp, error) {
	parts := schemePartsPattern.FindStringSubmatch(scheme)
	if parts == nil {
		return nil, fmt.Errorf("unable to split scheme %s into parts", scheme)
	}
	protocol, domain, path, query := parts[1], parts[2], parts[3], parts[4]

	switch protocol {
	case "http://", "https://", "spotify:":
	default:
		return nil, fmt.Errorf("unrecognized protocol %s for scheme %s", protocol, scheme)
	}

	if domain == "" {
		return nil, fmt.Errorf("no domain found for scheme %s", scheme)
	}

	if path == "" && protocol != "spotify:" {
		return nil, fmt.Errorf("no path found for scheme %s", scheme)
	}

	protocolPattern := regexp.QuoteMeta(protocol)
	if protocol == "http://" {
		// Allow http schemes to match https URLs as well
		protocolPattern = "https?://"
	}

	return regexp.Compile("^" +
		protocolPattern +
		wildcardsToRegexp(domain, "[^/]*?") +
		wildcardsToRegexp(path, ".*?") +
		wildcardsToRegexp(query, ".*?") +
		"$")
}

// wildcardsToRegexp escapes every regexp special character in part and replaces each * wildcard with the given
// regular expression. The text between wildcards is escaped piece by piece, so no placeholder character is needed
// and a literal character in the scheme can never be mistaken for a wildcard.
func wildcardsToRegexp(part string, wildcard string) string {
	pieces := strings.Split(part, "*")
	for i, piece := range pieces {
		pieces[i] = regexp.QuoteMeta(piece)
	}
	return strings.Join(pieces, wildcard)
}

View File

@ -0,0 +1,56 @@
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
package oembed
import (
"encoding/json"
"fmt"
"io"
)
// OEmbedResponse holds the fields of an oEmbed response that was received as JSON.
type OEmbedResponse struct {
	// Type can be one of "photo", "video", "link", or "rich"
	Type string `json:"type"`

	// Fields that may be defined for any response type
	Version      string `json:"version"`
	Title        string `json:"title,omitempty"`
	AuthorName   string `json:"author_name,omitempty"`
	AuthorURL    string `json:"author_url,omitempty"`
	ProviderName string `json:"provider_name,omitempty"`
	ProviderURL  string `json:"provider_url,omitempty"`
	// CacheAge is kept as text. ResponseFromJSON fills it in whether the provider sent a JSON string or a JSON number.
	CacheAge        string `json:"cache_age,omitempty"`
	ThumbnailURL    string `json:"thumbnail_url,omitempty"`
	ThumbnailWidth  int    `json:"thumbnail_width,omitempty"`
	ThumbnailHeight int    `json:"thumbnail_height,omitempty"`

	// Fields that are required for responses with the type "photo"
	URL string `json:"url"`

	// Fields that are required for responses of the type "video" or "rich"
	HTML string `json:"html"`

	// Fields that are required for responses with the type "photo", "video", or "rich"
	Width  int `json:"width"`
	Height int `json:"height"`
}

// oEmbedStringOrNumber is a string that can be decoded from either a JSON string or a JSON number. A number is
// stored using the exact text that it had in the JSON.
type oEmbedStringOrNumber string

// UnmarshalJSON implements json.Unmarshaler. A JSON null leaves the value untouched.
func (s *oEmbedStringOrNumber) UnmarshalJSON(data []byte) error {
	if string(data) == "null" {
		return nil
	}

	if len(data) > 0 && data[0] == '"' {
		var text string
		if err := json.Unmarshal(data, &text); err != nil {
			return err
		}
		*s = oEmbedStringOrNumber(text)
		return nil
	}

	var number json.Number
	if err := json.Unmarshal(data, &number); err != nil {
		return err
	}
	*s = oEmbedStringOrNumber(number.String())
	return nil
}

// ResponseFromJSON decodes an oEmbed response from r and performs a quick sanity check on it.
//
// It returns an error if r does not contain valid JSON for an OEmbedResponse, if the version of the response is not
// "1.0", or if its type is not one of "photo", "video", "link", or "rich". The cache_age field is accepted both as
// a string and as a number because providers send either; a numeric value ends up in CacheAge as its decimal text.
func ResponseFromJSON(r io.Reader) (*OEmbedResponse, error) {
	// The outer CacheAge field takes precedence over the embedded one while decoding because it is less deeply
	// nested, which lets it accept numbers without changing the type of OEmbedResponse.CacheAge.
	var payload struct {
		OEmbedResponse
		CacheAge oEmbedStringOrNumber `json:"cache_age,omitempty"`
	}

	err := json.NewDecoder(r).Decode(&payload)
	if err != nil {
		return nil, err
	}

	response := payload.OEmbedResponse
	response.CacheAge = string(payload.CacheAge)

	// Do a quick smoke test to confirm that this is hopefully a valid oEmbed response
	if response.Version != "1.0" {
		return nil, fmt.Errorf("ResponseFromJson: Received unsupported response version %s", response.Version)
	}

	if response.Type != "photo" && response.Type != "video" && response.Type != "link" && response.Type != "rich" {
		return nil, fmt.Errorf("ResponseFromJson: Received unsupported response type %s", response.Type)
	}

	return &response, nil
}

View File

@ -0,0 +1,35 @@
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
// Code generated by "go generate ./channels/app/oembed"; DO NOT EDIT.
//go:generate go run ./generator/providers_generator.go
package oembed
import (
"regexp"
)
// providers is the list of oEmbed endpoints known to this package. It is filled in by init below.
var providers []*ProviderEndpoint
// init registers the YouTube oEmbed endpoint together with the link patterns that it handles.
//
// Every pattern is anchored and only matches https links. The watch, v, embed pages are only matched on a host
// that ends in ".youtube.com" (such as www.youtube.com), while playlist, shorts and live links are also matched on
// the bare youtube.com host. Links on youtu.be are matched for any path.
func init() {
providers = []*ProviderEndpoint{
{
URL: "https://www.youtube.com/oembed",
Patterns: []*regexp.Regexp{
regexp.MustCompile(`^https://[^/]*?\.youtube\.com/watch.*?$`),
regexp.MustCompile(`^https://[^/]*?\.youtube\.com/v/.*?$`),
regexp.MustCompile(`^https://youtu\.be/.*?$`),
regexp.MustCompile(`^https://[^/]*?\.youtube\.com/playlist\?list=.*?$`),
regexp.MustCompile(`^https://youtube\.com/playlist\?list=.*?$`),
regexp.MustCompile(`^https://[^/]*?\.youtube\.com/shorts.*?$`),
regexp.MustCompile(`^https://youtube\.com/shorts.*?$`),
regexp.MustCompile(`^https://[^/]*?\.youtube\.com/embed/.*?$`),
regexp.MustCompile(`^https://[^/]*?\.youtube\.com/live.*?$`),
regexp.MustCompile(`^https://youtube\.com/live.*?$`),
},
},
}
}

View File

@ -10,9 +10,12 @@ import (
"time"
"github.com/dyatlov/go-opengraph/opengraph"
ogImage "github.com/dyatlov/go-opengraph/opengraph/types/image"
"github.com/pkg/errors"
"golang.org/x/net/html/charset"
"github.com/mattermost/mattermost/server/public/shared/mlog"
"github.com/mattermost/mattermost/server/v8/channels/app/oembed"
)
const (
@ -144,3 +147,31 @@ func openGraphDecodeHTMLEntities(og *opengraph.OpenGraph) {
og.Title = html.UnescapeString(og.Title)
og.Description = html.UnescapeString(og.Description)
}
// parseOpenGraphFromOEmbed reads an oEmbed response from body and converts it into OpenGraph data for requestURL.
//
// At most MaxOpenGraphResponseSize bytes are read from body. The resulting OpenGraph data has the title from the
// oEmbed response and requestURL as its URL. If the response has a thumbnail, it is added as the only image. When an
// image proxy is configured, the image URLs are rewritten to go through it.
//
// An error is returned if body does not contain a valid oEmbed response.
func (a *App) parseOpenGraphFromOEmbed(requestURL string, body io.Reader) (*opengraph.OpenGraph, error) {
	oEmbedResponse, err := oembed.ResponseFromJSON(io.LimitReader(body, MaxOpenGraphResponseSize))
	if err != nil {
		return nil, errors.Wrap(err, "parseOpenGraphFromOEmbed: Unable to parse oEmbed response")
	}

	og := &opengraph.OpenGraph{
		Type:  "opengraph",
		Title: oEmbedResponse.Title,
		URL:   requestURL,
	}

	if oEmbedResponse.ThumbnailURL != "" {
		// The dimensions come from the remote provider as signed integers. Converting a negative value straight to
		// uint64 would wrap around to an enormous size, so anything that isn't positive is treated as unknown.
		var width, height uint64
		if oEmbedResponse.ThumbnailWidth > 0 {
			width = uint64(oEmbedResponse.ThumbnailWidth)
		}
		if oEmbedResponse.ThumbnailHeight > 0 {
			height = uint64(oEmbedResponse.ThumbnailHeight)
		}

		og.Images = append(og.Images, &ogImage.Image{
			Type:   "image",
			URL:    oEmbedResponse.ThumbnailURL,
			Width:  width,
			Height: height,
		})
	}

	if toProxyURL := a.ImageProxyAdder(); toProxyURL != nil {
		og = openGraphDataWithProxyAddedToImageURLs(og, toProxyURL)
	}

	return og, nil
}

View File

@ -17,12 +17,14 @@ import (
"time"
"github.com/dyatlov/go-opengraph/opengraph"
"github.com/pkg/errors"
"golang.org/x/net/idna"
"github.com/mattermost/mattermost/server/public/model"
"github.com/mattermost/mattermost/server/public/shared/markdown"
"github.com/mattermost/mattermost/server/public/shared/mlog"
"github.com/mattermost/mattermost/server/public/shared/request"
"github.com/mattermost/mattermost/server/v8/channels/app/oembed"
"github.com/mattermost/mattermost/server/v8/channels/app/platform"
"github.com/mattermost/mattermost/server/v8/channels/utils/imgutils"
)
@ -646,17 +648,39 @@ func (a *App) getLinkMetadata(c request.CTX, requestURL string, timestamp int64,
var err error
if looksLikeAPermalink(requestURL, a.GetSiteURL()) && *a.Config().ServiceSettings.EnablePermalinkPreviews {
permalink, err = a.getLinkMetadataForPermalink(c, requestURL)
if err != nil {
return nil, nil, nil, err
}
} else if oEmbedProvider := oembed.FindEndpointForURL(requestURL); oEmbedProvider != nil {
og, err = a.getLinkMetadataFromOEmbed(c, requestURL, oEmbedProvider)
} else {
og, image, err = a.getLinkMetadataForURL(c, requestURL)
// We intentionally don't return early on an error because we want to save that there is no metadata for this link
a.saveLinkMetadataToDatabase(requestURL, timestamp, og, image)
}
// Write back to cache and database, even if there was an error and the results are nil
cacheLinkMetadata(requestURL, timestamp, og, image, permalink)
return og, image, permalink, err
}
func (a *App) getLinkMetadataForPermalink(c request.CTX, requestURL string) (*model.Permalink, error) {
referencedPostID := requestURL[len(requestURL)-26:]
referencedPost, appErr := a.GetSinglePost(c, referencedPostID, false)
// TODO: Look into saving a value in the LinkMetadata.Data field to prevent perpetually re-querying for the deleted post.
if appErr != nil {
return nil, nil, nil, appErr
return nil, appErr
}
referencedChannel, appErr := a.GetChannel(c, referencedPost.ChannelId)
if appErr != nil {
return nil, nil, nil, appErr
return nil, appErr
}
var referencedTeam *model.Team
@ -665,11 +689,12 @@ func (a *App) getLinkMetadata(c request.CTX, requestURL string, timestamp int64,
} else {
referencedTeam, appErr = a.GetTeam(referencedChannel.TeamId)
if appErr != nil {
return nil, nil, nil, appErr
return nil, appErr
}
}
// Get metadata for embedded post
var permalink *model.Permalink
if a.containsPermalink(c, referencedPost) {
// referencedPost contains a permalink: we don't get its metadata
permalink = &model.Permalink{PreviewPost: model.NewPreviewPost(referencedPost, referencedTeam, referencedChannel)}
@ -678,12 +703,42 @@ func (a *App) getLinkMetadata(c request.CTX, requestURL string, timestamp int64,
referencedPostWithMetadata := a.PreparePostForClientWithEmbedsAndImages(c, referencedPost, false, false, false)
permalink = &model.Permalink{PreviewPost: model.NewPreviewPost(referencedPostWithMetadata, referencedTeam, referencedChannel)}
}
} else {
return permalink, nil
}
// getLinkMetadataFromOEmbed requests the oEmbed data for requestURL from the given provider and converts the
// response into OpenGraph data.
//
// The request asks for JSON, sends the server's default locale as the preferred language, and is limited by the
// configured link metadata timeout. An error is returned if the request cannot be made, if the provider answers
// with a status code outside of the 2xx range, or if the response cannot be parsed as an oEmbed response.
func (a *App) getLinkMetadataFromOEmbed(c request.CTX, requestURL string, provider *oembed.ProviderEndpoint) (*opengraph.OpenGraph, error) {
	// Named req so that it doesn't shadow the imported request package
	req, err := http.NewRequest("GET", provider.GetProviderURL(requestURL), nil)
	if err != nil {
		return nil, err
	}

	req.Header.Add("Accept", "application/json")
	req.Header.Add("Accept-Language", *a.Config().LocalizationSettings.DefaultServerLocale)

	client := a.HTTPService().MakeClient(false)
	client.Timeout = time.Duration(*a.Config().ExperimentalSettings.LinkMetadataTimeoutMilliseconds) * time.Millisecond

	res, err := client.Do(req)
	if err != nil {
		c.Logger().Warn("error fetching oEmbed data", mlog.Err(err))
		return nil, errors.Wrap(err, "getLinkMetadataFromOEmbed: Unable to get oEmbed data")
	}
	defer func() {
		// Drain the body before closing it so that the connection can be reused
		io.Copy(io.Discard, res.Body)
		res.Body.Close()
	}()

	// An error response isn't an oEmbed document, so report the status instead of a misleading parse failure
	if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices {
		return nil, errors.Errorf("getLinkMetadataFromOEmbed: oEmbed provider responded with status code %d", res.StatusCode)
	}

	return a.parseOpenGraphFromOEmbed(requestURL, res.Body)
}
func (a *App) getLinkMetadataForURL(c request.CTX, requestURL string) (*opengraph.OpenGraph, *model.PostImage, error) {
var request *http.Request
// Make request for a web page or an image
request, err = http.NewRequest("GET", requestURL, nil)
request, err := http.NewRequest("GET", requestURL, nil)
if err != nil {
return nil, nil, nil, err
return nil, nil, err
}
var body io.ReadCloser
@ -719,19 +774,16 @@ func (a *App) getLinkMetadata(c request.CTX, requestURL string, timestamp int64,
}()
}
var og *opengraph.OpenGraph
var image *model.PostImage
if err == nil {
// Parse the data
og, image, err = a.parseLinkMetadata(requestURL, body, contentType)
}
og = model.TruncateOpenGraph(og) // remove unwanted length of texts
a.saveLinkMetadataToDatabase(requestURL, timestamp, og, image)
}
// Write back to cache and database, even if there was an error and the results are nil
cacheLinkMetadata(requestURL, timestamp, og, image, permalink)
return og, image, permalink, err
return og, image, err
}
// resolveMetadataURL resolves a given URL relative to the server's site URL.