MM-55267 Add ability for server-side Markdown code to understand emojis (#25332)

* MM-55267 Add ability for server-side Markdown code to understand emojis

* Remove unused regex
This commit is contained in:
Harrison Healey 2023-11-13 14:38:05 -05:00 committed by GitHub
parent 448d442a0b
commit 9397970644
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 277 additions and 1 deletions

View File

@ -0,0 +1,42 @@
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
package markdown
import (
"regexp"
)
// emojiRegex recognizes a named emoji such as :taco: at the very start of its input. The name may contain lowercase
// ASCII letters, digits, '_', '-' and '+', and the closing colon must not be immediately followed by a word
// character. It is based on the mobile app's emoji parsing from https://github.com/mattermost/commonmark.js
var emojiRegex = regexp.MustCompile(`^:([a-z0-9_\-+]+):\B`)
// parseEmoji attempts to parse a named emoji (e.g. :taco:) starting at the current parser position. If an emoji is
// found, it appends an Emoji to p.inlines, advances p.position past the closing colon and returns true. Otherwise,
// it returns false without modifying the parser.
func (p *inlineParser) parseEmoji() bool {
	// Only allow emojis after non-word characters. Every position except the very start of the input has a
	// preceding byte to inspect, and that includes position 1.
	if p.position > 0 && isWordByte(p.raw[p.position-1]) {
		return false
	}

	remaining := p.raw[p.position:]

	// emojiRegex is anchored to the start of remaining, so a match always begins at the current position.
	loc := emojiRegex.FindStringIndex(remaining)
	if loc == nil {
		return false
	}

	// Note that there may not be a system or custom emoji that exists with this name
	p.inlines = append(p.inlines, &Emoji{
		// Strip the leading and trailing colons from the match.
		Name: remaining[loc[0]+1 : loc[1]-1],
	})
	p.position += loc[1] - loc[0]

	return true
}

View File

@ -0,0 +1,203 @@
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
package markdown
import (
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestParseEmoji drives parseEmoji directly: each case places the parser at a given byte offset of the input and
// checks the returned flag, the resulting parser position and, on success, the name of the emitted Emoji.
func TestParseEmoji(t *testing.T) {
	type emojiCase struct {
		Input             string
		Position          int
		ExpectedOk        bool
		ExpectedPosition  int
		ExpectedEmojiName string
	}

	cases := map[string]emojiCase{
		"just a colon":                           {Input: ":", Position: 0, ExpectedOk: false, ExpectedPosition: 0},
		"no closing colon":                       {Input: ":emoji", Position: 0, ExpectedOk: false, ExpectedPosition: 0},
		"no closing colon before whitespace":     {Input: ":emoji example", Position: 0, ExpectedOk: false, ExpectedPosition: 0},
		"valid emoji":                            {Input: ":emoji:", Position: 0, ExpectedOk: true, ExpectedPosition: 7, ExpectedEmojiName: "emoji"},
		"valid emoji with punctuation":           {Input: ":valid-emoji:", Position: 0, ExpectedOk: true, ExpectedPosition: 13, ExpectedEmojiName: "valid-emoji"},
		"valid emoji with text before":           {Input: "this is an :emoji:", Position: 11, ExpectedOk: true, ExpectedPosition: 18, ExpectedEmojiName: "emoji"},
		"invalid emoji with text before":         {Input: "this is not an :emoji", Position: 15, ExpectedOk: false, ExpectedPosition: 15},
		"valid emoji with text after":            {Input: ":emoji: before some text", Position: 0, ExpectedOk: true, ExpectedPosition: 7, ExpectedEmojiName: "emoji"},
		"valid emoji with text before and after": {Input: "this is an :emoji: in a sentence", Position: 11, ExpectedOk: true, ExpectedPosition: 18, ExpectedEmojiName: "emoji"},
		"multiple emojis 1":                      {Input: ":multiple: :emojis:", Position: 0, ExpectedOk: true, ExpectedPosition: 10, ExpectedEmojiName: "multiple"},
		"multiple emojis 2":                      {Input: ":multiple: :emojis:", Position: 11, ExpectedOk: true, ExpectedPosition: 19, ExpectedEmojiName: "emojis"},
	}

	for name, tt := range cases {
		t.Run(name, func(t *testing.T) {
			parser := newInlineParser(tt.Input, []Range{}, []*ReferenceDefinition{})
			parser.raw = tt.Input
			parser.position = tt.Position

			matched := parser.parseEmoji()

			assert.Equal(t, tt.ExpectedOk, matched)
			assert.Equal(t, tt.ExpectedPosition, parser.position)

			if !tt.ExpectedOk {
				return
			}

			// On success the emoji must be the most recently appended inline.
			require.True(t, len(parser.inlines) > 0)
			last := parser.inlines[len(parser.inlines)-1]
			require.IsType(t, &Emoji{}, last)
			assert.Equal(t, tt.ExpectedEmojiName, last.(*Emoji).Name)
		})
	}
}
func TestParseEmojiFull(t *testing.T) {
// These tests are based on https://github.com/mattermost/commonmark.js/blob/master/test/mattermost.txt
for name, tc := range map[string]struct {
Markdown string
ExpectedHTML string
}{
// Valid emojis
"emoji": {
Markdown: "This is an :emoji:",
ExpectedHTML: `<p>This is an <span data-emoji-name="emoji" data-literal=":emoji:" /></p>`,
},
"emoji with underscore": {
Markdown: "This is an :emo_ji:",
ExpectedHTML: `<p>This is an <span data-emoji-name="emo_ji" data-literal=":emo_ji:" /></p>`,
},
"emoji with hyphen": {
Markdown: "This is an :emo-ji:",
ExpectedHTML: `<p>This is an <span data-emoji-name="emo-ji" data-literal=":emo-ji:" /></p>`,
},
"emoji with numbers": {
Markdown: "This is an :emoji123:",
ExpectedHTML: `<p>This is an <span data-emoji-name="emoji123" data-literal=":emoji123:" /></p>`,
},
"emoji in brackets": {
Markdown: "This is an (:emoji:)",
ExpectedHTML: `<p>This is an (<span data-emoji-name="emoji" data-literal=":emoji:" />)</p>`,
},
"two emojis without space between": {
Markdown: "These are some :emoji1::emoji2:",
ExpectedHTML: `<p>These are some <span data-emoji-name="emoji1" data-literal=":emoji1:" /><span data-emoji-name="emoji2" data-literal=":emoji2:" /></p>`,
},
"two emojis separated by a slash": {
Markdown: "These are some :emoji1:/:emoji2:",
ExpectedHTML: `<p>These are some <span data-emoji-name="emoji1" data-literal=":emoji1:" />/<span data-emoji-name="emoji2" data-literal=":emoji2:" /></p>`,
},
"+1 emoji": {
Markdown: "This is an :+1:",
ExpectedHTML: `<p>This is an <span data-emoji-name="+1" data-literal=":+1:" /></p>`,
},
"-1 emoji": {
Markdown: "This is an :-1:",
ExpectedHTML: `<p>This is an <span data-emoji-name="-1" data-literal=":-1:" /></p>`,
},
"emoji with surrounding words": {
Markdown: "This is an :emoji: in a sentence.",
ExpectedHTML: `<p>This is an <span data-emoji-name="emoji" data-literal=":emoji:" /> in a sentence.</p>`,
},
// Invalid emojis
"incomplete emoji 1": {
Markdown: "This is not an :emoji",
ExpectedHTML: `<p>This is not an :emoji</p>`,
},
"incomplete emoji 2": {
Markdown: "This is not an emoji:",
ExpectedHTML: `<p>This is not an emoji:</p>`,
},
"invalid emoji with whitespace": {
Markdown: "This is not an :emo ji:",
ExpectedHTML: `<p>This is not an :emo ji:</p>`,
},
"invalid emoji with other punctuation": {
Markdown: "This is not an :emo'ji:",
ExpectedHTML: `<p>This is not an :emo'ji:</p>`,
},
"invalid emoji due to adjacent text 1": {
Markdown: "Thisisnotan:emoji:",
// This differs slightly from our commonmark.js implementation because it doesn't require :// when autolinking
ExpectedHTML: `<p>Thisisnotan:emoji:</p>`,
},
"invalid emoji due to adjacent text 2": {
Markdown: "This is not an :emoji:isit",
// This differs slightly from our commonmark.js implementation because it doesn't require :// when autolinking
ExpectedHTML: `<p>This is not an :emoji:isit</p>`,
},
"invalid emoji due to adjacent text 3": {
Markdown: "This is not an:emoji:isit",
// This differs slightly from our commonmark.js implementation because it doesn't require :// when autolinking
ExpectedHTML: `<p>This is not an:emoji:isit</p>`,
},
} {
t.Run(name, func(t *testing.T) {
actual := RenderHTML(tc.Markdown)
assert.Equal(t, tc.ExpectedHTML, actual)
})
}
}

View File

@ -162,6 +162,10 @@ func RenderInlineHTML(inline Inline) (result string) {
result += RenderInlineHTML(inline)
}
result += "</a>"
case *Emoji:
escapedName := htmlEscaper.Replace(v.Name)
result += fmt.Sprintf(`<span data-emoji-name="%s" data-literal=":%s:" />`, escapedName, escapedName)
default:
panic(fmt.Sprintf("missing case for type %T", v))
}

View File

@ -101,6 +101,12 @@ func (i *Autolink) Destination() string {
return destination
}
// Emoji is an inline element for a named emoji written as :name: in the source text.
type Emoji struct {
inlineBase
// Name is the emoji name without the surrounding colons. The parser does not check that a system or custom
// emoji with this name actually exists.
Name string
}
type delimiterType int
const (
@ -575,12 +581,24 @@ func (p *inlineParser) Parse() []Inline {
p.parseLinkOrImageDelimiter()
case ']':
p.lookForLinkOrImage()
case 'w', 'W', ':':
case 'w', 'W':
matched := p.parseAutolink(c)
if !matched {
p.parseText()
}
case ':':
matched := p.parseAutolink(c)
if matched {
continue
}
matched = p.parseEmoji()
if matched {
continue
}
p.parseText()
default:
p.parseText()
}

View File

@ -55,6 +55,15 @@ func isAlphanumericByte(c byte) bool {
return isAlphanumeric(rune(c))
}
// isWord reports whether c is an underscore or satisfies isAlphanumeric, which is intended to match the \w regexp
// character class.
func isWord(c rune) bool {
	if c == '_' {
		return true
	}
	return isAlphanumeric(c)
}
// isWordByte is the byte-oriented counterpart of isWord: it widens c to a rune and applies the same check.
func isWordByte(c byte) bool {
	asRune := rune(c)
	return isWord(asRune)
}
func nextNonWhitespace(markdown string, position int) int {
for offset, c := range []byte(markdown[position:]) {
if !isWhitespaceByte(c) {