MM-60351 Use oEmbed for YouTube links (#28312)

* Split up handling of permalinks and other links in getLinkMetadata

* MM-60351 Use oEmbed for YouTube links

* Explicitly request json from the oEmbed provider

* Fix linter

* Fix type of CacheAge field

* Address feedback
This commit is contained in:
Harrison Healey 2024-10-01 14:06:45 -04:00 committed by GitHub
parent d53a2ef4df
commit 76021c76a0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 549 additions and 73 deletions

View File

@ -0,0 +1,42 @@
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
package oembed
import (
"net/url"
"regexp"
)
//go:generate go run ./generator/providers_generator.go
// ProviderEndpoint describes a single oEmbed endpoint along with the URL patterns that it is able to serve.
type ProviderEndpoint struct {
	// URL is the address of the provider's oEmbed endpoint.
	URL string
	// Patterns holds the regular expressions that a link is tested against to decide if this endpoint applies.
	Patterns []*regexp.Regexp
}

// GetProviderURL returns the URL to request from this endpoint in order to fetch the oEmbed data for requestURL.
// The result is the endpoint URL with format=json and url=<requestURL> added to its query string.
func (e *ProviderEndpoint) GetProviderURL(requestURL string) string {
	// The parse error is deliberately ignored because the endpoint URL is validated when the list of providers
	// is generated.
	endpointURL, _ := url.Parse(e.URL)

	params := endpointURL.Query()
	params.Add("format", "json")
	params.Add("url", requestURL)

	endpointURL.RawQuery = params.Encode()
	return endpointURL.String()
}
// FindEndpointForURL looks through the supported oEmbed providers for one with a pattern that matches requestURL.
// It returns the first matching ProviderEndpoint, or nil when none of the supported providers recognize the URL.
func FindEndpointForURL(requestURL string) *ProviderEndpoint {
	for i := range providers {
		candidate := providers[i]

		for j := range candidate.Patterns {
			if candidate.Patterns[j].MatchString(requestURL) {
				return candidate
			}
		}
	}

	return nil
}

View File

@ -0,0 +1,63 @@
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
package oembed
import (
"slices"
"testing"
"github.com/stretchr/testify/assert"
)
// TestFindEndpointForURL checks that FindEndpointForURL returns the YouTube endpoint for the kinds of YouTube links
// that are covered by the generated patterns and nil for everything else.
func TestFindEndpointForURL(t *testing.T) {
	// slices.IndexFunc returns -1 when nothing matches, so fail with a clear message instead of panicking on an
	// out-of-range index if the YouTube provider is ever missing from the generated list.
	youtubeIndex := slices.IndexFunc(providers, func(provider *ProviderEndpoint) bool {
		return provider.URL == "https://www.youtube.com/oembed"
	})
	if youtubeIndex < 0 {
		t.Fatal("YouTube provider not found in the generated list of oEmbed providers")
	}
	youtubeProvider := providers[youtubeIndex]

	for _, testCase := range []struct {
		Name     string
		Input    string
		Expected *ProviderEndpoint
	}{
		{
			Name:     "random URL",
			Input:    "https://example.com/some/random.url",
			Expected: nil,
		},
		{
			Name:     "YouTube home page",
			Input:    "https://www.youtube.com",
			Expected: nil,
		},
		{
			Name:     "YouTube video",
			Input:    "https://www.youtube.com/watch?v=szfZfQFUSnU",
			Expected: youtubeProvider,
		},
		{
			Name:     "YouTube video with short link and tracking information",
			Input:    "https://youtu.be/Qq3zukqBFqQ?si=iK_TPT20H30mH90G",
			Expected: youtubeProvider,
		},
		{
			Name:     "YouTube video with playlist",
			Input:    "https://www.youtube.com/watch?v=Qq3zukqBFqQ&list=PL-jqvaPsjQpMqnRgFEw_3fuGQbcVDTpaM",
			Expected: youtubeProvider,
		},
		{
			Name:     "YouTube playlist",
			Input:    "https://www.youtube.com/playlist?list=PL-jqvaPsjQpMqnRgFEw_3fuGQbcVDTpaM",
			Expected: youtubeProvider,
		},
		{
			Name:     "YouTube channel",
			Input:    "https://www.youtube.com/@MattermostHQ",
			Expected: nil,
		},
	} {
		t.Run(testCase.Name, func(t *testing.T) {
			assert.Equal(t, testCase.Expected, FindEndpointForURL(testCase.Input))
		})
	}
}

View File

@ -0,0 +1 @@
providers.json

View File

@ -0,0 +1,30 @@
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
// Code generated by "go generate ./channels/app/oembed"; DO NOT EDIT.
//go:generate go run ./generator/providers_generator.go
package oembed
import (
"regexp"
)
// providers is the list of oEmbed endpoints that FindEndpointForURL searches through.
var providers []*ProviderEndpoint
// init fills providers with one entry per generated endpoint, compiling each of its URL patterns.
func init() {
providers = []*ProviderEndpoint{
{{- range .Endpoints }}
{
URL: "{{ .URL }}",
Patterns: []*regexp.Regexp{
{{- range .Patterns }}
regexp.MustCompile(`{{ . }}`),
{{- end }}
},
},
{{- end }}
}
}

View File

@ -0,0 +1,166 @@
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
package main
import (
"encoding/json"
"fmt"
"net/url"
"os"
"regexp"
"slices"
"strings"
"text/template"
"github.com/mattermost/mattermost/server/v8/channels/app/oembed"
"github.com/pkg/errors"
)
// To update the list of oEmbed providers that we support:
// 1. Download the latest providers.json file from https://oembed.com/providers.json and place it in this folder
// 2. If desired, update supportedProviders below to add the names of additional oEmbed providers that we want to use
// 3. Run `go generate ./channels/app/oembed` from the server folder
var (
// supportedProviders contains the names of all of the oEmbed providers that we currently support.
//
// As of writing, we're only going to support YouTube because they've stopped giving us the required OpenGraph
// metadata. When we want to support oEmbed embeds for other providers, this will need to be updated.
supportedProviders = []string{
"YouTube",
}
// outputTemplate is the parsed providers.go.tmpl template that main executes to produce providers_gen.go. It is
// parsed when the program starts, and template.Must panics if the template file cannot be loaded.
outputTemplate = template.Must(template.New("providers.go.tmpl").ParseFiles("./generator/providers.go.tmpl"))
)
// oEmbedProvider is the shape of one element of the JSON array that main reads from providers.json.
type oEmbedProvider struct {
// ProviderName is compared against supportedProviders to decide whether the provider is included in the output.
ProviderName string `json:"provider_name"`
// ProviderURL is decoded from the JSON but is not read by the generator.
ProviderURL string `json:"provider_url"`
// Endpoints lists the provider's oEmbed endpoints, which extractEndpointsFromProvider converts one by one.
Endpoints []*oEmbedEndpoint `json:"endpoints"`
}
// oEmbedEndpoint is a single endpoint entry nested under a provider in providers.json.
type oEmbedEndpoint struct {
// Schemes holds the URL schemes, using * as a wildcard, that schemeToPattern turns into regular expressions.
Schemes []string `json:"schemes,omitempty"`
// URL is the address of the endpoint. It is checked with url.Parse while generating the list of providers.
URL string `json:"url"`
// Discovery and Formats are decoded from the JSON but are not read by the generator.
Discovery bool `json:"discovery,omitempty"`
Formats []string `json:"formats,omitempty"`
}
// main regenerates providers_gen.go from ./generator/providers.json using outputTemplate. Only the providers named
// in supportedProviders are included in the output.
//
// The generated source is rendered in memory and only written to disk once every step has succeeded. This
// generator imports the oembed package that providers_gen.go belongs to, so truncating that file before the input
// has been validated would leave the package broken after a failed run.
func main() {
	inputJSON, err := os.ReadFile("./generator/providers.json")
	if err != nil {
		panic(errors.Wrap(err, "Unable to read providers.json. Did you forget to put it next to providers_generator.go?"))
	}

	var input []*oEmbedProvider
	if err = json.Unmarshal(inputJSON, &input); err != nil {
		panic(errors.Wrap(err, "Unable to read providers.json"))
	}

	var endpoints []*oembed.ProviderEndpoint
	for _, inputProvider := range input {
		if !slices.Contains(supportedProviders, inputProvider.ProviderName) {
			continue
		}

		providerEndpoints, extractErr := extractEndpointsFromProvider(inputProvider)
		if extractErr != nil {
			panic(errors.Wrap(extractErr, "Unable to convert oEmbedProvider from providers.json to a ProviderEndpoint"))
		}

		endpoints = append(endpoints, providerEndpoints...)
	}

	// Render to a buffer first so that a template failure can't leave a partially written file behind
	var rendered strings.Builder
	err = outputTemplate.Execute(&rendered, map[string]any{
		"Endpoints": endpoints,
	})
	if err != nil {
		panic(errors.Wrap(err, "Unable to write file using template"))
	}

	// 0o666 matches the permissions that os.Create would have used for a new file
	err = os.WriteFile("./providers_gen.go", []byte(rendered.String()), 0o666)
	if err != nil {
		panic(errors.Wrap(err, "Unable to write output file"))
	}
}
// extractEndpointsFromProvider converts the data for one provider in providers.json into zero or more compact
// ProviderEndpoints, one for each of the provider's endpoints that declares at least one URL scheme. It returns an
// error if an endpoint URL cannot be parsed or if a scheme cannot be converted into a pattern.
func extractEndpointsFromProvider(in *oEmbedProvider) ([]*oembed.ProviderEndpoint, error) {
	var result []*oembed.ProviderEndpoint

	for _, ep := range in.Endpoints {
		// Validate the endpoint URL now so that it doesn't need to be error checked at runtime
		if _, parseErr := url.Parse(ep.URL); parseErr != nil {
			return nil, parseErr
		}

		var compiled []*regexp.Regexp
		for _, scheme := range ep.Schemes {
			re, convertErr := schemeToPattern(scheme)
			if convertErr != nil {
				return nil, convertErr
			}

			compiled = append(compiled, re)
		}

		// An endpoint without any patterns could never match a URL, so it is left out of the result
		if len(compiled) == 0 {
			continue
		}

		result = append(result, &oembed.ProviderEndpoint{
			URL:      ep.URL,
			Patterns: compiled,
		})
	}

	return result, nil
}
// schemePartsPattern splits an oEmbed scheme into its protocol, domain, path, and query string. It is compiled once
// rather than on every call to schemeToPattern.
var schemePartsPattern = regexp.MustCompile(`^(\w+:(?://)?)([^/]*)(/[^?]*)?(\?[^?]*)?$`)

// schemeToPattern converts a URL scheme from providers.json, such as "https://*.youtube.com/watch*", into an
// anchored regular expression that matches the URLs covered by that scheme.
//
// A * in the domain matches any run of characters other than a slash, while a * in the path or query string matches
// any run of characters. Every other character, including %, is matched literally. Schemes that use the http
// protocol also match https URLs.
//
// An error is returned if the scheme cannot be split into parts, uses a protocol other than http, https, or
// spotify, has no domain, or has no path (the path is optional for spotify schemes only).
func schemeToPattern(scheme string) (*regexp.Regexp, error) {
	parts := schemePartsPattern.FindStringSubmatch(scheme)
	if parts == nil {
		return nil, fmt.Errorf("unable to split scheme %s into parts", scheme)
	}
	protocol, domain, path, query := parts[1], parts[2], parts[3], parts[4]

	switch protocol {
	case "http://", "https://", "spotify:":
	default:
		return nil, fmt.Errorf("unrecognized protocol %s for scheme %s", protocol, scheme)
	}

	if domain == "" {
		return nil, fmt.Errorf("no domain found for scheme %s", scheme)
	}

	if path == "" && protocol != "spotify:" {
		return nil, fmt.Errorf("no path found for scheme %s", scheme)
	}

	protocolPattern := regexp.QuoteMeta(protocol)
	if protocol == "http://" {
		// Allow http schemes to match https URLs as well
		protocolPattern = "https?://"
	}

	return regexp.Compile("^" +
		protocolPattern +
		wildcardsToPattern(domain, "[^/]*?") +
		wildcardsToPattern(path, ".*?") +
		wildcardsToPattern(query, ".*?") +
		"$")
}

// wildcardsToPattern escapes part for use in a regular expression and replaces every * in it with wildcard.
//
// The literal text between wildcards is escaped piece by piece instead of swapping * for a placeholder character,
// because any placeholder could also appear literally in a scheme and would then be mistaken for a wildcard.
func wildcardsToPattern(part string, wildcard string) string {
	literals := strings.Split(part, "*")
	for i, literal := range literals {
		literals[i] = regexp.QuoteMeta(literal)
	}

	return strings.Join(literals, wildcard)
}

View File

@ -0,0 +1,56 @@
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
package oembed
import (
"encoding/json"
"fmt"
"io"
)
// OEmbedResponse holds the fields of a JSON response returned by an oEmbed provider.
type OEmbedResponse struct {
	// Type can be one of "photo", "video", "link", or "rich"
	Type string `json:"type"`

	// Fields that may be defined for any response type
	Version         string `json:"version"`
	Title           string `json:"title,omitempty"`
	AuthorName      string `json:"author_name,omitempty"`
	AuthorURL       string `json:"author_url,omitempty"`
	ProviderName    string `json:"provider_name,omitempty"`
	ProviderURL     string `json:"provider_url,omitempty"`
	CacheAge        string `json:"cache_age,omitempty"`
	ThumbnailURL    string `json:"thumbnail_url,omitempty"`
	ThumbnailWidth  int    `json:"thumbnail_width,omitempty"`
	ThumbnailHeight int    `json:"thumbnail_height,omitempty"`

	// Fields that are required for responses with the type "photo"
	URL string `json:"url"`

	// Fields that are required for responses of the type "video" or "rich"
	HTML string `json:"html"`

	// Fields that are required for responses with the type "photo", "video", or "rich"
	Width  int `json:"width"`
	Height int `json:"height"`
}

// ResponseFromJSON decodes an oEmbed response from r.
//
// It returns an error if r does not contain JSON that can be decoded into an OEmbedResponse, if the response's
// version is anything other than "1.0", or if its type is not one of "photo", "video", "link", or "rich". The
// type-specific required fields are not validated.
func ResponseFromJSON(r io.Reader) (*OEmbedResponse, error) {
	var response OEmbedResponse
	if err := json.NewDecoder(r).Decode(&response); err != nil {
		return nil, err
	}

	// Do a quick smoke test to confirm that this is hopefully a valid oEmbed response
	if response.Version != "1.0" {
		return nil, fmt.Errorf("ResponseFromJSON: Received unsupported response version %s", response.Version)
	}

	switch response.Type {
	case "photo", "video", "link", "rich":
	default:
		return nil, fmt.Errorf("ResponseFromJSON: Received unsupported response type %s", response.Type)
	}

	return &response, nil
}

View File

@ -0,0 +1,35 @@
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
// Code generated by "go generate ./channels/app/oembed"; DO NOT EDIT.
//go:generate go run ./generator/providers_generator.go
package oembed
import (
"regexp"
)
// providers is the list of oEmbed endpoints that FindEndpointForURL searches through.
var providers []*ProviderEndpoint
// init fills providers with one entry per generated endpoint, compiling each of its URL patterns.
func init() {
providers = []*ProviderEndpoint{
{
URL: "https://www.youtube.com/oembed",
Patterns: []*regexp.Regexp{
regexp.MustCompile(`^https://[^/]*?\.youtube\.com/watch.*?$`),
regexp.MustCompile(`^https://[^/]*?\.youtube\.com/v/.*?$`),
regexp.MustCompile(`^https://youtu\.be/.*?$`),
regexp.MustCompile(`^https://[^/]*?\.youtube\.com/playlist\?list=.*?$`),
regexp.MustCompile(`^https://youtube\.com/playlist\?list=.*?$`),
regexp.MustCompile(`^https://[^/]*?\.youtube\.com/shorts.*?$`),
regexp.MustCompile(`^https://youtube\.com/shorts.*?$`),
regexp.MustCompile(`^https://[^/]*?\.youtube\.com/embed/.*?$`),
regexp.MustCompile(`^https://[^/]*?\.youtube\.com/live.*?$`),
regexp.MustCompile(`^https://youtube\.com/live.*?$`),
},
},
}
}

View File

@ -10,9 +10,12 @@ import (
"time"
"github.com/dyatlov/go-opengraph/opengraph"
ogImage "github.com/dyatlov/go-opengraph/opengraph/types/image"
"github.com/pkg/errors"
"golang.org/x/net/html/charset"
"github.com/mattermost/mattermost/server/public/shared/mlog"
"github.com/mattermost/mattermost/server/v8/channels/app/oembed"
)
const (
@ -144,3 +147,31 @@ func openGraphDecodeHTMLEntities(og *opengraph.OpenGraph) {
og.Title = html.UnescapeString(og.Title)
og.Description = html.UnescapeString(og.Description)
}
// parseOpenGraphFromOEmbed reads an oEmbed JSON response from body and converts it into OpenGraph data for
// requestURL. At most MaxOpenGraphResponseSize bytes of body are read.
//
// The returned data carries the response's title and, when the response includes a thumbnail URL, a single image
// built from that thumbnail. If an image proxy is configured, image URLs are rewritten to go through it. An error is
// returned if the body cannot be parsed as an oEmbed response.
func (a *App) parseOpenGraphFromOEmbed(requestURL string, body io.Reader) (*opengraph.OpenGraph, error) {
	oEmbedResponse, err := oembed.ResponseFromJSON(io.LimitReader(body, MaxOpenGraphResponseSize))
	if err != nil {
		return nil, errors.Wrap(err, "parseOpenGraphFromOEmbed: Unable to parse oEmbed response")
	}

	og := &opengraph.OpenGraph{
		Type:  "opengraph",
		Title: oEmbedResponse.Title,
		URL:   requestURL,
	}

	if oEmbedResponse.ThumbnailURL != "" {
		og.Images = append(og.Images, &ogImage.Image{
			Type: "image",
			URL:  oEmbedResponse.ThumbnailURL,
			// The dimensions come from a remote server, and converting a negative int to uint64 would wrap
			// around to an enormous value, so anything below zero is treated as zero.
			Width:  uint64(max(oEmbedResponse.ThumbnailWidth, 0)),
			Height: uint64(max(oEmbedResponse.ThumbnailHeight, 0)),
		})
	}

	if toProxyURL := a.ImageProxyAdder(); toProxyURL != nil {
		og = openGraphDataWithProxyAddedToImageURLs(og, toProxyURL)
	}

	return og, nil
}

View File

@ -17,12 +17,14 @@ import (
"time"
"github.com/dyatlov/go-opengraph/opengraph"
"github.com/pkg/errors"
"golang.org/x/net/idna"
"github.com/mattermost/mattermost/server/public/model"
"github.com/mattermost/mattermost/server/public/shared/markdown"
"github.com/mattermost/mattermost/server/public/shared/mlog"
"github.com/mattermost/mattermost/server/public/shared/request"
"github.com/mattermost/mattermost/server/v8/channels/app/oembed"
"github.com/mattermost/mattermost/server/v8/channels/app/platform"
"github.com/mattermost/mattermost/server/v8/channels/utils/imgutils"
)
@ -646,84 +648,17 @@ func (a *App) getLinkMetadata(c request.CTX, requestURL string, timestamp int64,
var err error
if looksLikeAPermalink(requestURL, a.GetSiteURL()) && *a.Config().ServiceSettings.EnablePermalinkPreviews {
referencedPostID := requestURL[len(requestURL)-26:]
permalink, err = a.getLinkMetadataForPermalink(c, requestURL)
referencedPost, appErr := a.GetSinglePost(c, referencedPostID, false)
// TODO: Look into saving a value in the LinkMetadata.Data field to prevent perpetually re-querying for the deleted post.
if appErr != nil {
return nil, nil, nil, appErr
}
referencedChannel, appErr := a.GetChannel(c, referencedPost.ChannelId)
if appErr != nil {
return nil, nil, nil, appErr
}
var referencedTeam *model.Team
if referencedChannel.Type == model.ChannelTypeDirect || referencedChannel.Type == model.ChannelTypeGroup {
referencedTeam = &model.Team{}
} else {
referencedTeam, appErr = a.GetTeam(referencedChannel.TeamId)
if appErr != nil {
return nil, nil, nil, appErr
}
}
// Get metadata for embedded post
if a.containsPermalink(c, referencedPost) {
// referencedPost contains a permalink: we don't get its metadata
permalink = &model.Permalink{PreviewPost: model.NewPreviewPost(referencedPost, referencedTeam, referencedChannel)}
} else {
// referencedPost does not contain a permalink: we get its metadata
referencedPostWithMetadata := a.PreparePostForClientWithEmbedsAndImages(c, referencedPost, false, false, false)
permalink = &model.Permalink{PreviewPost: model.NewPreviewPost(referencedPostWithMetadata, referencedTeam, referencedChannel)}
}
} else {
var request *http.Request
// Make request for a web page or an image
request, err = http.NewRequest("GET", requestURL, nil)
if err != nil {
return nil, nil, nil, err
}
} else if oEmbedProvider := oembed.FindEndpointForURL(requestURL); oEmbedProvider != nil {
og, err = a.getLinkMetadataFromOEmbed(c, requestURL, oEmbedProvider)
} else {
og, image, err = a.getLinkMetadataForURL(c, requestURL)
var body io.ReadCloser
var contentType string
if (request.URL.Scheme+"://"+request.URL.Host) == a.GetSiteURL() && request.URL.Path == "/api/v4/image" {
// /api/v4/image requires authentication, so bypass the API by hitting the proxy directly
body, contentType, err = a.ImageProxy().GetImageDirect(a.ImageProxy().GetUnproxiedImageURL(request.URL.String()))
} else {
request.Header.Add("Accept", "image/*")
request.Header.Add("Accept", "text/html;q=0.8")
request.Header.Add("Accept-Language", *a.Config().LocalizationSettings.DefaultServerLocale)
client := a.HTTPService().MakeClient(false)
client.Timeout = time.Duration(*a.Config().ExperimentalSettings.LinkMetadataTimeoutMilliseconds) * time.Millisecond
var res *http.Response
res, err = client.Do(request)
if err != nil {
c.Logger().Warn("error fetching OG image data", mlog.Err(err))
}
if res != nil {
body = res.Body
contentType = res.Header.Get("Content-Type")
}
}
if body != nil {
defer func() {
io.Copy(io.Discard, body)
body.Close()
}()
}
if err == nil {
// Parse the data
og, image, err = a.parseLinkMetadata(requestURL, body, contentType)
}
og = model.TruncateOpenGraph(og) // remove unwanted length of texts
// We intentionally don't return early on an error because we want to save that there is no metadata for this link
a.saveLinkMetadataToDatabase(requestURL, timestamp, og, image)
}
@ -734,6 +669,123 @@ func (a *App) getLinkMetadata(c request.CTX, requestURL string, timestamp int64,
return og, image, permalink, err
}
// getLinkMetadataForPermalink builds the permalink preview for the post referenced by requestURL. It returns an
// error if the referenced post, its channel, or (for channels that aren't direct or group channels) its team cannot
// be loaded.
func (a *App) getLinkMetadataForPermalink(c request.CTX, requestURL string) (*model.Permalink, error) {
	// The ID of the referenced post is taken from the last 26 characters of the URL
	postID := requestURL[len(requestURL)-26:]

	post, appErr := a.GetSinglePost(c, postID, false)
	// TODO: Look into saving a value in the LinkMetadata.Data field to prevent perpetually re-querying for the deleted post.
	if appErr != nil {
		return nil, appErr
	}

	channel, appErr := a.GetChannel(c, post.ChannelId)
	if appErr != nil {
		return nil, appErr
	}

	// Direct and group channels are given an empty team instead of looking one up
	team := &model.Team{}
	if channel.Type != model.ChannelTypeDirect && channel.Type != model.ChannelTypeGroup {
		team, appErr = a.GetTeam(channel.TeamId)
		if appErr != nil {
			return nil, appErr
		}
	}

	// Get metadata for the embedded post, unless it contains a permalink itself, in which case the post is
	// previewed as is
	previewSource := post
	if !a.containsPermalink(c, post) {
		previewSource = a.PreparePostForClientWithEmbedsAndImages(c, post, false, false, false)
	}

	return &model.Permalink{PreviewPost: model.NewPreviewPost(previewSource, team, channel)}, nil
}
// getLinkMetadataFromOEmbed requests the oEmbed data for requestURL from the given provider and converts the
// response into OpenGraph data.
//
// The request asks for JSON in the server's default locale and is limited by the configured link metadata timeout.
// An error is returned if the request cannot be created or sent, if the provider answers with a status code outside
// of the 2xx range, or if the response body cannot be parsed as an oEmbed response.
func (a *App) getLinkMetadataFromOEmbed(c request.CTX, requestURL string, provider *oembed.ProviderEndpoint) (*opengraph.OpenGraph, error) {
	request, err := http.NewRequest("GET", provider.GetProviderURL(requestURL), nil)
	if err != nil {
		return nil, err
	}

	request.Header.Add("Accept", "application/json")
	request.Header.Add("Accept-Language", *a.Config().LocalizationSettings.DefaultServerLocale)

	client := a.HTTPService().MakeClient(false)
	client.Timeout = time.Duration(*a.Config().ExperimentalSettings.LinkMetadataTimeoutMilliseconds) * time.Millisecond

	res, err := client.Do(request)
	if err != nil {
		c.Logger().Warn("error fetching oEmbed data", mlog.Err(err))
		return nil, errors.Wrap(err, "getLinkMetadataFromOEmbed: Unable to get oEmbed data")
	}
	defer func() {
		io.Copy(io.Discard, res.Body)
		res.Body.Close()
	}()

	// Providers report problems such as missing or private content through the status code, and the body of such a
	// response isn't oEmbed data, so don't hand it to the parser.
	if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices {
		return nil, errors.Errorf("getLinkMetadataFromOEmbed: Received unexpected status code %d from oEmbed provider", res.StatusCode)
	}

	return a.parseOpenGraphFromOEmbed(requestURL, res.Body)
}
// getLinkMetadataForURL fetches requestURL as either a web page or an image and parses link metadata from the
// response. Requests for this server's own /api/v4/image endpoint are served from the image proxy directly instead
// of over HTTP. The OpenGraph data is truncated before being returned, and any error from fetching or parsing is
// returned alongside whatever was produced.
func (a *App) getLinkMetadataForURL(c request.CTX, requestURL string) (*opengraph.OpenGraph, *model.PostImage, error) {
	// Make request for a web page or an image
	req, err := http.NewRequest("GET", requestURL, nil)
	if err != nil {
		return nil, nil, err
	}

	var (
		body        io.ReadCloser
		contentType string
	)

	if (req.URL.Scheme+"://"+req.URL.Host) == a.GetSiteURL() && req.URL.Path == "/api/v4/image" {
		// /api/v4/image requires authentication, so bypass the API by hitting the proxy directly
		body, contentType, err = a.ImageProxy().GetImageDirect(a.ImageProxy().GetUnproxiedImageURL(req.URL.String()))
	} else {
		req.Header.Add("Accept", "image/*")
		req.Header.Add("Accept", "text/html;q=0.8")
		req.Header.Add("Accept-Language", *a.Config().LocalizationSettings.DefaultServerLocale)

		httpClient := a.HTTPService().MakeClient(false)
		httpClient.Timeout = time.Duration(*a.Config().ExperimentalSettings.LinkMetadataTimeoutMilliseconds) * time.Millisecond

		res, doErr := httpClient.Do(req)
		if doErr != nil {
			c.Logger().Warn("error fetching OG image data", mlog.Err(doErr))
		}
		err = doErr

		if res != nil {
			body = res.Body
			contentType = res.Header.Get("Content-Type")
		}
	}

	if body != nil {
		// Drain whatever is left of the body before closing it
		defer func() {
			io.Copy(io.Discard, body)
			body.Close()
		}()
	}

	var (
		og    *opengraph.OpenGraph
		image *model.PostImage
	)

	if err == nil {
		// Parse the data
		og, image, err = a.parseLinkMetadata(requestURL, body, contentType)
	}

	// remove unwanted length of texts
	return model.TruncateOpenGraph(og), image, err
}
// resolveMetadataURL resolves a given URL relative to the server's site URL.
func resolveMetadataURL(requestURL string, siteURL string) string {
base, err := url.Parse(siteURL)