From 939797064439c73968cb9ebceabebc13e91dc809 Mon Sep 17 00:00:00 2001
From: Harrison Healey <harrisonmhealey@gmail.com>
Date: Mon, 13 Nov 2023 14:38:05 -0500
Subject: [PATCH] MM-55267 Add ability for server-side Markdown code to
 understand emojis (#25332)

* MM-55267 Add ability for server-side Markdown code to understand emojis

* Remove unused regex
---
 server/public/shared/markdown/emoji.go      |  42 ++++
 server/public/shared/markdown/emoji_test.go | 203 ++++++++++++++++++++
 server/public/shared/markdown/html.go       |   4 +
 server/public/shared/markdown/inlines.go    |  20 +-
 server/public/shared/markdown/markdown.go   |   9 +
 5 files changed, 277 insertions(+), 1 deletion(-)
 create mode 100644 server/public/shared/markdown/emoji.go
 create mode 100644 server/public/shared/markdown/emoji_test.go

diff --git a/server/public/shared/markdown/emoji.go b/server/public/shared/markdown/emoji.go
new file mode 100644
index 0000000000..8d2aaf9d9a
--- /dev/null
+++ b/server/public/shared/markdown/emoji.go
@@ -0,0 +1,42 @@
+// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
+// See LICENSE.txt for license information.
+
+package markdown
+
+import (
+	"regexp"
+)
+
+// Based off the mobile app's emoji parsing from https://github.com/mattermost/commonmark.js
+
+var (
+	emojiRegex = regexp.MustCompile(`^:([a-z0-9_\-+]+):\B`) // :name: at the very start of the input; \B rejects a closing colon that is directly followed by a word character
+)
+
+// parseEmoji attempts to parse a named emoji (eg. :taco:) starting at the current parser position. If an emoji is
+// found, it appends an Emoji to p.inlines, advances p.position past it, and returns true. Otherwise, it returns false.
+func (p *inlineParser) parseEmoji() bool {
+	// Only allow emojis after non-word characters. The check must also run when the colon is the second byte of
+	// the input (position 1), otherwise text like "a:emoji:" would be treated as an emoji.
+	if p.position > 0 && isWordByte(p.raw[p.position-1]) {
+		return false
+	}
+
+	remaining := p.raw[p.position:]
+
+	// emojiRegex is anchored with ^, so a match always begins at offset 0 of remaining.
+	loc := emojiRegex.FindStringIndex(remaining)
+	if loc == nil {
+		return false
+	}
+
+	// Strip the surrounding colons. Note that there may not be a system or custom emoji that exists with this name.
+	p.inlines = append(p.inlines, &Emoji{
+		Name: remaining[loc[0]+1 : loc[1]-1],
+	})
+
+	// Advance past the whole match, including both colons.
+	p.position += loc[1] - loc[0]
+
+	return true
+}
diff --git a/server/public/shared/markdown/emoji_test.go b/server/public/shared/markdown/emoji_test.go
new file mode 100644
index 0000000000..b2a4959700
--- /dev/null
+++ b/server/public/shared/markdown/emoji_test.go
@@ -0,0 +1,203 @@
+// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
+// See LICENSE.txt for license information.
+
+package markdown
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestParseEmoji(t *testing.T) {
+	for name, tc := range map[string]struct {
+		Input             string // text the parser is created with and that is assigned to p.raw
+		Position          int    // value assigned to p.position before calling parseEmoji
+		ExpectedOk        bool   // expected return value of parseEmoji
+		ExpectedPosition  int    // expected p.position after the call
+		ExpectedEmojiName string // expected Name of the last inline; only checked when ExpectedOk is true
+	}{
+		"just a colon": {
+			Input:            ":",
+			Position:         0,
+			ExpectedOk:       false,
+			ExpectedPosition: 0,
+		},
+		"no closing colon": {
+			Input:            ":emoji",
+			Position:         0,
+			ExpectedOk:       false,
+			ExpectedPosition: 0,
+		},
+		"no closing colon before whitespace": {
+			Input:            ":emoji example",
+			Position:         0,
+			ExpectedOk:       false,
+			ExpectedPosition: 0,
+		},
+		"valid emoji": {
+			Input:             ":emoji:",
+			Position:          0,
+			ExpectedOk:        true,
+			ExpectedPosition:  7,
+			ExpectedEmojiName: "emoji",
+		},
+		"valid emoji with punctuation": {
+			Input:             ":valid-emoji:",
+			Position:          0,
+			ExpectedOk:        true,
+			ExpectedPosition:  13,
+			ExpectedEmojiName: "valid-emoji",
+		},
+		"valid emoji with text before": {
+			Input:             "this is an :emoji:",
+			Position:          11,
+			ExpectedOk:        true,
+			ExpectedPosition:  18,
+			ExpectedEmojiName: "emoji",
+		},
+		"invalid emoji with text before": {
+			Input:            "this is not an :emoji",
+			Position:         15,
+			ExpectedOk:       false,
+			ExpectedPosition: 15,
+		},
+		"valid emoji with text after": {
+			Input:             ":emoji: before some text",
+			Position:          0,
+			ExpectedOk:        true,
+			ExpectedPosition:  7,
+			ExpectedEmojiName: "emoji",
+		},
+		"valid emoji with text before and after": {
+			Input:             "this is an :emoji: in a sentence",
+			Position:          11,
+			ExpectedOk:        true,
+			ExpectedPosition:  18,
+			ExpectedEmojiName: "emoji",
+		},
+		"multiple emojis 1": {
+			Input:             ":multiple: :emojis:",
+			Position:          0,
+			ExpectedOk:        true,
+			ExpectedPosition:  10,
+			ExpectedEmojiName: "multiple",
+		},
+		"multiple emojis 2": {
+			Input:             ":multiple: :emojis:",
+			Position:          11,
+			ExpectedOk:        true,
+			ExpectedPosition:  19,
+			ExpectedEmojiName: "emojis",
+		},
+	} {
+		t.Run(name, func(t *testing.T) {
+			p := newInlineParser(tc.Input, []Range{}, []*ReferenceDefinition{})
+			p.raw = tc.Input // parseEmoji reads p.raw directly, so set it by hand here
+			p.position = tc.Position
+
+			ok := p.parseEmoji()
+
+			assert.Equal(t, tc.ExpectedOk, ok)
+			assert.Equal(t, tc.ExpectedPosition, p.position)
+			if tc.ExpectedOk {
+				require.True(t, len(p.inlines) > 0)
+				require.IsType(t, &Emoji{}, p.inlines[len(p.inlines)-1])
+				assert.Equal(t, tc.ExpectedEmojiName, p.inlines[len(p.inlines)-1].(*Emoji).Name)
+			}
+		})
+	}
+}
+
+func TestParseEmojiFull(t *testing.T) {
+	// These tests are based on https://github.com/mattermost/commonmark.js/blob/master/test/mattermost.txt
+
+	for name, tc := range map[string]struct {
+		Markdown     string // input passed to RenderHTML
+		ExpectedHTML string // exact string RenderHTML is expected to return
+	}{
+		// Valid emojis
+
+		"emoji": {
+			Markdown:     "This is an :emoji:",
+			ExpectedHTML: `<p>This is an <span data-emoji-name="emoji" data-literal=":emoji:" /></p>`,
+		},
+		"emoji with underscore": {
+			Markdown:     "This is an :emo_ji:",
+			ExpectedHTML: `<p>This is an <span data-emoji-name="emo_ji" data-literal=":emo_ji:" /></p>`,
+		},
+		"emoji with hyphen": {
+			Markdown:     "This is an :emo-ji:",
+			ExpectedHTML: `<p>This is an <span data-emoji-name="emo-ji" data-literal=":emo-ji:" /></p>`,
+		},
+		"emoji with numbers": {
+			Markdown:     "This is an :emoji123:",
+			ExpectedHTML: `<p>This is an <span data-emoji-name="emoji123" data-literal=":emoji123:" /></p>`,
+		},
+		"emoji in brackets": {
+			Markdown:     "This is an (:emoji:)",
+			ExpectedHTML: `<p>This is an (<span data-emoji-name="emoji" data-literal=":emoji:" />)</p>`,
+		},
+		"two emojis without space between": {
+			Markdown:     "These are some :emoji1::emoji2:",
+			ExpectedHTML: `<p>These are some <span data-emoji-name="emoji1" data-literal=":emoji1:" /><span data-emoji-name="emoji2" data-literal=":emoji2:" /></p>`,
+		},
+		"two emojis separated by a slash": {
+			Markdown:     "These are some :emoji1:/:emoji2:",
+			ExpectedHTML: `<p>These are some <span data-emoji-name="emoji1" data-literal=":emoji1:" />/<span data-emoji-name="emoji2" data-literal=":emoji2:" /></p>`,
+		},
+		"+1 emoji": {
+			Markdown:     "This is an :+1:",
+			ExpectedHTML: `<p>This is an <span data-emoji-name="+1" data-literal=":+1:" /></p>`,
+		},
+		"-1 emoji": {
+			Markdown:     "This is an :-1:",
+			ExpectedHTML: `<p>This is an <span data-emoji-name="-1" data-literal=":-1:" /></p>`,
+		},
+		"emoji with surrounding words": {
+			Markdown:     "This is an :emoji: in a sentence.",
+			ExpectedHTML: `<p>This is an <span data-emoji-name="emoji" data-literal=":emoji:" /> in a sentence.</p>`,
+		},
+
+		// Invalid emojis
+
+		"incomplete emoji 1": {
+			Markdown:     "This is not an :emoji",
+			ExpectedHTML: `<p>This is not an :emoji</p>`,
+		},
+		"incomplete emoji 2": {
+			Markdown:     "This is not an emoji:",
+			ExpectedHTML: `<p>This is not an emoji:</p>`,
+		},
+		"invalid emoji with whitespace": {
+			Markdown:     "This is not an :emo ji:",
+			ExpectedHTML: `<p>This is not an :emo ji:</p>`,
+		},
+		"invalid emoji with other punctuation": {
+			Markdown:     "This is not an :emo'ji:",
+			ExpectedHTML: `<p>This is not an :emo'ji:</p>`,
+		},
+		"invalid emoji due to adjacent text 1": {
+			Markdown: "Thisisnotan:emoji:",
+			// This differs slightly from our commonmark.js implementation because it doesn't require :// when autolinking
+			ExpectedHTML: `<p>Thisisnotan:emoji:</p>`,
+		},
+		"invalid emoji due to adjacent text 2": {
+			Markdown: "This is not an :emoji:isit",
+			// This differs slightly from our commonmark.js implementation because it doesn't require :// when autolinking
+			ExpectedHTML: `<p>This is not an :emoji:isit</p>`,
+		},
+		"invalid emoji due to adjacent text 3": {
+			Markdown: "This is not an:emoji:isit",
+			// This differs slightly from our commonmark.js implementation because it doesn't require :// when autolinking
+			ExpectedHTML: `<p>This is not an:emoji:isit</p>`,
+		},
+	} {
+		t.Run(name, func(t *testing.T) {
+			actual := RenderHTML(tc.Markdown)
+
+			assert.Equal(t, tc.ExpectedHTML, actual)
+		})
+	}
+}
diff --git a/server/public/shared/markdown/html.go b/server/public/shared/markdown/html.go
index 52583074c5..f15bfb664f 100644
--- a/server/public/shared/markdown/html.go
+++ b/server/public/shared/markdown/html.go
@@ -162,6 +162,10 @@ func RenderInlineHTML(inline Inline) (result string) {
 			result += RenderInlineHTML(inline)
 		}
 		result += "</a>"
+	case *Emoji:
+		escapedName := htmlEscaper.Replace(v.Name)
+		result += fmt.Sprintf(`<span data-emoji-name="%s" data-literal=":%s:" />`, escapedName, escapedName)
+
 	default:
 		panic(fmt.Sprintf("missing case for type %T", v))
 	}
diff --git a/server/public/shared/markdown/inlines.go b/server/public/shared/markdown/inlines.go
index 973ae5ed21..fa10753bca 100644
--- a/server/public/shared/markdown/inlines.go
+++ b/server/public/shared/markdown/inlines.go
@@ -101,6 +101,12 @@ func (i *Autolink) Destination() string {
 	return destination
 }
 
+type Emoji struct {
+	inlineBase
+
+	Name string // emoji name without the surrounding colons (e.g. "taco" for :taco:)
+}
+
 type delimiterType int
 
 const (
@@ -575,12 +581,24 @@ func (p *inlineParser) Parse() []Inline {
 			p.parseLinkOrImageDelimiter()
 		case ']':
 			p.lookForLinkOrImage()
-		case 'w', 'W', ':':
+		case 'w', 'W':
 			matched := p.parseAutolink(c)
 
 			if !matched {
 				p.parseText()
 			}
+		case ':':
+			matched := p.parseAutolink(c)
+			if matched {
+				continue
+			}
+
+			matched = p.parseEmoji()
+			if matched {
+				continue
+			}
+
+			p.parseText()
 		default:
 			p.parseText()
 		}
diff --git a/server/public/shared/markdown/markdown.go b/server/public/shared/markdown/markdown.go
index 5ccdad8ced..5d45400a4f 100644
--- a/server/public/shared/markdown/markdown.go
+++ b/server/public/shared/markdown/markdown.go
@@ -55,6 +55,15 @@ func isAlphanumericByte(c byte) bool {
 	return isAlphanumeric(rune(c))
 }
 
+// isWord reports whether c is alphanumeric (per isAlphanumeric) or an underscore; intended to match the \w regexp character class
+func isWord(c rune) bool {
+	return isAlphanumeric(c) || c == '_'
+}
+
+func isWordByte(c byte) bool {
+	return isWord(rune(c)) // same check as isWord, applied to a single byte
+}
+
 func nextNonWhitespace(markdown string, position int) int {
 	for offset, c := range []byte(markdown[position:]) {
 		if !isWhitespaceByte(c) {