From 939797064439c73968cb9ebceabebc13e91dc809 Mon Sep 17 00:00:00 2001 From: Harrison Healey Date: Mon, 13 Nov 2023 14:38:05 -0500 Subject: [PATCH] MM-55267 Add ability for server-side Markdown code to understand emojis (#25332) * MM-55267 Add ability for server-side Markdown code to understand emojis * Remove unused regex --- server/public/shared/markdown/emoji.go | 42 ++++ server/public/shared/markdown/emoji_test.go | 203 ++++++++++++++++++++ server/public/shared/markdown/html.go | 4 + server/public/shared/markdown/inlines.go | 20 +- server/public/shared/markdown/markdown.go | 9 + 5 files changed, 277 insertions(+), 1 deletion(-) create mode 100644 server/public/shared/markdown/emoji.go create mode 100644 server/public/shared/markdown/emoji_test.go diff --git a/server/public/shared/markdown/emoji.go b/server/public/shared/markdown/emoji.go new file mode 100644 index 0000000000..8d2aaf9d9a --- /dev/null +++ b/server/public/shared/markdown/emoji.go @@ -0,0 +1,42 @@ +// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved. +// See LICENSE.txt for license information. + +package markdown + +import ( + "regexp" +) + +// Based off the mobile app's emoji parsing from https://github.com/mattermost/commonmark.js + +var ( + emojiRegex = regexp.MustCompile(`^:([a-z0-9_\-+]+):\B`) +) + +// parseEmoji attempts to parse a named emoji (eg. :taco:) starting at the current parser position. If an emoji is +// found, it adds that to p.inlines and returns true. Otherwise, it returns false. 
+func (p *inlineParser) parseEmoji() bool {
+	// Only allow emojis after non-word characters. Use > 0 (not > 1) so the byte at index 0 is checked too.
+	if p.position > 0 {
+		prevChar := p.raw[p.position-1]
+
+		if isWordByte(prevChar) {
+			return false
+		}
+	}
+
+	remaining := p.raw[p.position:]
+
+	loc := emojiRegex.FindStringIndex(remaining)
+	if loc == nil {
+		return false
+	}
+
+	// Note that there may not be a system or custom emoji that exists with this name
+	p.inlines = append(p.inlines, &Emoji{
+		Name: remaining[loc[0]+1 : loc[1]-1], // strip the surrounding colons
+	})
+	p.position += loc[1] - loc[0] // emojiRegex is anchored with ^, so loc[0] is always 0
+
+	return true
+}
diff --git a/server/public/shared/markdown/emoji_test.go b/server/public/shared/markdown/emoji_test.go
new file mode 100644
index 0000000000..b2a4959700
--- /dev/null
+++ b/server/public/shared/markdown/emoji_test.go
@@ -0,0 +1,203 @@
+// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
+// See LICENSE.txt for license information.
+
+package markdown
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestParseEmoji(t *testing.T) {
+	for name, tc := range map[string]struct {
+		Input             string
+		Position          int
+		ExpectedOk        bool
+		ExpectedPosition  int
+		ExpectedEmojiName string
+	}{
+		"just a colon": {
+			Input:            ":",
+			Position:         0,
+			ExpectedOk:       false,
+			ExpectedPosition: 0,
+		},
+		"no closing colon": {
+			Input:            ":emoji",
+			Position:         0,
+			ExpectedOk:       false,
+			ExpectedPosition: 0,
+		},
+		"no closing colon before whitespace": {
+			Input:            ":emoji example",
+			Position:         0,
+			ExpectedOk:       false,
+			ExpectedPosition: 0,
+		},
+		"valid emoji": {
+			Input:             ":emoji:",
+			Position:          0,
+			ExpectedOk:        true,
+			ExpectedPosition:  7,
+			ExpectedEmojiName: "emoji",
+		},
+		"valid emoji with punctuation": {
+			Input:             ":valid-emoji:",
+			Position:          0,
+			ExpectedOk:        true,
+			ExpectedPosition:  13,
+			ExpectedEmojiName: "valid-emoji",
+		},
+		"valid emoji with text before": {
+			Input:             "this is an :emoji:",
+			Position:          11,
+			ExpectedOk:        true,
+			ExpectedPosition:  18,
+			ExpectedEmojiName: "emoji",
}, + "invalid emoji with text before": { + Input: "this is not an :emoji", + Position: 15, + ExpectedOk: false, + ExpectedPosition: 15, + }, + "valid emoji with text after": { + Input: ":emoji: before some text", + Position: 0, + ExpectedOk: true, + ExpectedPosition: 7, + ExpectedEmojiName: "emoji", + }, + "valid emoji with text before and after": { + Input: "this is an :emoji: in a sentence", + Position: 11, + ExpectedOk: true, + ExpectedPosition: 18, + ExpectedEmojiName: "emoji", + }, + "multiple emojis 1": { + Input: ":multiple: :emojis:", + Position: 0, + ExpectedOk: true, + ExpectedPosition: 10, + ExpectedEmojiName: "multiple", + }, + "multiple emojis 2": { + Input: ":multiple: :emojis:", + Position: 11, + ExpectedOk: true, + ExpectedPosition: 19, + ExpectedEmojiName: "emojis", + }, + } { + t.Run(name, func(t *testing.T) { + p := newInlineParser(tc.Input, []Range{}, []*ReferenceDefinition{}) + p.raw = tc.Input + p.position = tc.Position + + ok := p.parseEmoji() + + assert.Equal(t, tc.ExpectedOk, ok) + assert.Equal(t, tc.ExpectedPosition, p.position) + if tc.ExpectedOk { + require.True(t, len(p.inlines) > 0) + require.IsType(t, &Emoji{}, p.inlines[len(p.inlines)-1]) + assert.Equal(t, tc.ExpectedEmojiName, p.inlines[len(p.inlines)-1].(*Emoji).Name) + } + }) + } +} + +func TestParseEmojiFull(t *testing.T) { + // These tests are based on https://github.com/mattermost/commonmark.js/blob/master/test/mattermost.txt + + for name, tc := range map[string]struct { + Markdown string + ExpectedHTML string + }{ + // Valid emojis + + "emoji": { + Markdown: "This is an :emoji:", + ExpectedHTML: `

This is an

`, + }, + "emoji with underscore": { + Markdown: "This is an :emo_ji:", + ExpectedHTML: `

This is an

`, + }, + "emoji with hyphen": { + Markdown: "This is an :emo-ji:", + ExpectedHTML: `

This is an

`, + }, + "emoji with numbers": { + Markdown: "This is an :emoji123:", + ExpectedHTML: `

This is an

`, + }, + "emoji in brackets": { + Markdown: "This is an (:emoji:)", + ExpectedHTML: `

This is an ()

`, + }, + "two emojis without space between": { + Markdown: "These are some :emoji1::emoji2:", + ExpectedHTML: `

These are some

`, + }, + "two emojis separated by a slash": { + Markdown: "These are some :emoji1:/:emoji2:", + ExpectedHTML: `

These are some /

`, + }, + "+1 emoji": { + Markdown: "This is an :+1:", + ExpectedHTML: `

This is an

`, + }, + "-1 emoji": { + Markdown: "This is an :-1:", + ExpectedHTML: `

This is an

`, + }, + "emoji with surrounding words": { + Markdown: "This is an :emoji: in a sentence.", + ExpectedHTML: `

This is an in a sentence.

`, + }, + + // Invalid emojis + + "incomplete emoji 1": { + Markdown: "This is not an :emoji", + ExpectedHTML: `

This is not an :emoji

`, + }, + "incomplete emoji 2": { + Markdown: "This is not an emoji:", + ExpectedHTML: `

This is not an emoji:

`, + }, + "invalid emoji with whitespace": { + Markdown: "This is not an :emo ji:", + ExpectedHTML: `

This is not an :emo ji:

`, + }, + "invalid emoji with other punctuation": { + Markdown: "This is not an :emo'ji:", + ExpectedHTML: `

This is not an :emo'ji:

`, + }, + "invalid emoji due to adjacent text 1": { + Markdown: "Thisisnotan:emoji:", + // This differs slightly from our commonmark.js implementation because it doesn't require :// when autolinking + ExpectedHTML: `

Thisisnotan:emoji:

`, + }, + "invalid emoji due to adjacent text 2": { + Markdown: "This is not an :emoji:isit", + // This differs slightly from our commonmark.js implementation because it doesn't require :// when autolinking + ExpectedHTML: `

This is not an :emoji:isit

`, + }, + "invalid emoji due to adjacent text 3": { + Markdown: "This is not an:emoji:isit", + // This differs slightly from our commonmark.js implementation because it doesn't require :// when autolinking + ExpectedHTML: `

This is not an:emoji:isit

`, + }, + } { + t.Run(name, func(t *testing.T) { + actual := RenderHTML(tc.Markdown) + + assert.Equal(t, tc.ExpectedHTML, actual) + }) + } +} diff --git a/server/public/shared/markdown/html.go b/server/public/shared/markdown/html.go index 52583074c5..f15bfb664f 100644 --- a/server/public/shared/markdown/html.go +++ b/server/public/shared/markdown/html.go @@ -162,6 +162,10 @@ func RenderInlineHTML(inline Inline) (result string) { result += RenderInlineHTML(inline) } result += "" + case *Emoji: + escapedName := htmlEscaper.Replace(v.Name) + result += fmt.Sprintf(``, escapedName, escapedName) + default: panic(fmt.Sprintf("missing case for type %T", v)) } diff --git a/server/public/shared/markdown/inlines.go b/server/public/shared/markdown/inlines.go index 973ae5ed21..fa10753bca 100644 --- a/server/public/shared/markdown/inlines.go +++ b/server/public/shared/markdown/inlines.go @@ -101,6 +101,12 @@ func (i *Autolink) Destination() string { return destination } +type Emoji struct { + inlineBase + + Name string +} + type delimiterType int const ( @@ -575,12 +581,24 @@ func (p *inlineParser) Parse() []Inline { p.parseLinkOrImageDelimiter() case ']': p.lookForLinkOrImage() - case 'w', 'W', ':': + case 'w', 'W': matched := p.parseAutolink(c) if !matched { p.parseText() } + case ':': + matched := p.parseAutolink(c) + if matched { + continue + } + + matched = p.parseEmoji() + if matched { + continue + } + + p.parseText() default: p.parseText() } diff --git a/server/public/shared/markdown/markdown.go b/server/public/shared/markdown/markdown.go index 5ccdad8ced..5d45400a4f 100644 --- a/server/public/shared/markdown/markdown.go +++ b/server/public/shared/markdown/markdown.go @@ -55,6 +55,15 @@ func isAlphanumericByte(c byte) bool { return isAlphanumeric(rune(c)) } +// isWord returns true if c matches the \w regexp character class +func isWord(c rune) bool { + return isAlphanumeric(c) || c == '_' +} + +func isWordByte(c byte) bool { + return isWord(rune(c)) +} + func 
nextNonWhitespace(markdown string, position int) int { for offset, c := range []byte(markdown[position:]) { if !isWhitespaceByte(c) {