FEATURE: HTML to Markdown conversion using native JavaScript ES6 classes (#5425)

This commit is contained in:
Vinoth Kannan 2017-12-15 10:28:20 +05:30 committed by GitHub
parent 0a863dd031
commit f0497ee9c4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 426 additions and 24 deletions

View File

@ -9,6 +9,7 @@ import { emojiUrlFor } from 'discourse/lib/text';
import { getRegister } from 'discourse-common/lib/get-owner';
import { findRawTemplate } from 'discourse/lib/raw-templates';
import { determinePostReplaceSelection, clipboardData } from 'discourse/lib/utilities';
import toMarkdown from 'discourse/lib/to-markdown';
import { ajax } from 'discourse/lib/ajax';
import { popupAjaxError } from 'discourse/lib/ajax-error';
import deprecated from 'discourse-common/lib/deprecated';
@ -647,7 +648,7 @@ export default Ember.Component.extend({
const { clipboard, types } = clipboardData(e);
let plainText = clipboard.getData("text/plain");
const html = clipboard.getData("text/html");
let html = clipboard.getData("text/html");
let handled = false;
if (plainText) {
@ -657,30 +658,19 @@ export default Ember.Component.extend({
this.appEvents.trigger('composer:insert-text', table);
handled = true;
}
if (html && html.includes("urn:schemas-microsoft-com:office:word")) {
html = ""; // use plain text data for microsoft word
}
}
if (this.siteSettings.enable_rich_text_paste && html && !handled) {
const placeholder = `${ plainText || I18n.t('pasting') }`;
const self = this;
const markdown = toMarkdown(html);
this.appEvents.trigger('composer:insert-text', placeholder);
handled = true;
ajax('/composer/parse_html', {
type: 'POST',
data: { html }
}).then(response => {
if (response.markdown) {
self.appEvents.trigger('composer:replace-text', placeholder, response.markdown);
} else if (!plainText) {
self.appEvents.trigger('composer:replace-text', placeholder, "");
}
}).catch(error => {
if (!plainText) {
self.appEvents.trigger('composer:replace-text', placeholder, "");
popupAjaxError(error);
}
});
if (!plainText || plainText.length < markdown.length) {
this.appEvents.trigger('composer:insert-text', markdown);
handled = true;
}
}
const uploadFiles = types.includes("Files") && !plainText && !handled;

View File

@ -5,4 +5,4 @@ export default function parseHTML(rawHtml) {
parser.parseComplete(rawHtml);
return builder.dom;
}
}

View File

@ -0,0 +1,285 @@
import parseHTML from 'discourse/helpers/parse-html';
// Removes every leading whitespace character (spaces, tabs, newlines, ...) from `str`.
const trimLeft = (str) => {
  return str.replace(/^\s+/, "");
};
// Removes every trailing whitespace character (spaces, tabs, newlines, ...) from `str`.
const trimRight = (str) => {
  return str.replace(/\s+$/, "");
};
/**
 * Base converter for one HTML tag.
 *
 * An instance carries the tag `name` plus a `prefix`/`suffix` pair that is
 * wrapped around the already-converted content of the element. The element
 * being converted is attached by the caller as `this.element` before
 * `toMarkdown()` is invoked. The static factory methods below each return a
 * zero-argument subclass specialised for one family of tags.
 */
class Tag {
  /**
   * @param {string} name   tag name this converter is responsible for
   * @param {string} prefix text placed before the converted content
   * @param {string} suffix text placed after the converted content
   */
  constructor(name, prefix = "", suffix = "") {
    this.name = name;
    this.prefix = prefix;
    this.suffix = suffix;
  }

  /**
   * Wraps `text` in the prefix and suffix; returns it untouched when both
   * are empty.
   */
  decorate(text) {
    if (!this.prefix && !this.suffix) {
      return text;
    }
    return [this.prefix, text, this.suffix].join("");
  }

  /**
   * Converts the attached element's children and decorates the result.
   * Blank or whitespace-only content is returned as-is, undecorated.
   */
  toMarkdown() {
    const inner = this.element.innerMarkdown();
    const hasContent = inner && inner.trim();
    return hasContent ? this.decorate(inner) : inner;
  }

  /** Tag names converted as blocks (surrounded by blank lines). */
  static blocks() {
    return [
      "address", "article", "aside", "blockquote", "dd", "div",
      "dl", "dt", "fieldset", "figcaption", "figure", "footer",
      "form", "header", "hgroup", "hr", "main", "nav",
      "ol", "p", "pre", "section", "table", "ul"
    ];
  }

  /** Heading tag names, ordered from level 1 to level 6. */
  static headings() {
    return [1, 2, 3, 4, 5, 6].map(level => `h${level}`);
  }

  /** `[tagName, markdownMarker]` pairs for the inline emphasis tags. */
  static emphases() {
    return [
      ["b", "**"],
      ["strong", "**"],
      ["i", "_"],
      ["em", "_"],
      ["s", "~~"],
      ["strike", "~~"]
    ];
  }

  /** Tag names whose content is followed by a separator unless they are the last sibling. */
  static slices() {
    return ["dt", "dd", "tr", "thead", "tbody", "tfoot"];
  }

  /** Tag names next to which adjacent text nodes have their whitespace trimmed. */
  static trimmable() {
    return Tag.blocks().concat(
      Tag.headings(),
      Tag.slices(),
      ["li", "td", "th", "br", "hr"]
    );
  }

  /** Builds a converter that surrounds its decorated content with blank lines. */
  static block(name, prefix, suffix) {
    return class extends Tag {
      constructor() {
        super(name, prefix, suffix);
      }

      decorate(text) {
        const body = `${this.prefix}${text}${this.suffix}`;
        return `\n\n${body}\n\n`;
      }
    };
  }

  /** Builds a block converter prefixed with `i` hash characters and a space. */
  static heading(name, i) {
    const hashes = new Array(i).fill("#").join("");
    return Tag.block(name, `${hashes} `, "");
  }

  /** Builds a converter that wraps trimmed content in `decorator` on both sides. */
  static emphasis(name, decorator) {
    return class extends Tag {
      constructor() {
        super(name, decorator, decorator);
      }

      decorate(text) {
        const trimmed = text.trim();
        // Content spanning several lines is wrapped in the literal HTML tag
        // instead of the markdown marker.
        if (trimmed.includes("\n")) {
          this.prefix = `<${this.name}>`;
          this.suffix = `</${this.name}>`;
        }
        return super.decorate(trimmed);
      }
    };
  }

  /** Builds a converter that ignores the element content and always emits `text`. */
  static replace(name, text) {
    return class extends Tag {
      constructor() {
        super(name, "", "");
        this.text = text;
      }

      toMarkdown() {
        return this.text;
      }
    };
  }

  /** Builds the `<a>` converter: `[text](href)`, or just the text when it equals the href or no href exists. */
  static link() {
    return class extends Tag {
      constructor() {
        super("a");
      }

      decorate(text) {
        const attr = this.element.attributes;
        const hasDistinctHref = attr && attr.href && text !== attr.href;
        if (!hasDistinctHref) {
          return text;
        }
        return `[${text}](${attr.href})`;
      }
    };
  }

  /** Builds the `<img>` converter: `![alt](src)`, or an empty string when no src can be found. */
  static image() {
    return class extends Tag {
      constructor() {
        super("img");
      }

      toMarkdown() {
        const element = this.element;
        const own = element.attributes;
        // The parent's attributes serve as a fallback for both src and alt.
        const inherited = element.parent && element.parent.attributes;
        const src = (own && own.src) || (inherited && inherited.src);
        if (!src) {
          return "";
        }
        const alt = (own && own.alt) || (inherited && inherited.alt) || "";
        return `![${alt}](${src})`;
      }
    };
  }

  /** Builds a converter that appends `suffix` to its content unless the element is the last sibling. */
  static slice(name, prefix, suffix) {
    return class extends Tag {
      constructor() {
        super(name, prefix, suffix);
      }

      decorate(text) {
        const isLastSibling = !this.element.next;
        if (isLastSibling) {
          this.suffix = "";
        }
        return `${text}${this.suffix}`;
      }
    };
  }

  /** Builds a table-cell converter: cells are separated by a single space. */
  static cell(name) {
    return Tag.slice(name, "", " ");
  }

  /** Builds the `<li>` converter: a `* ` bullet indented once per enclosing `ul` beyond the first. */
  static li() {
    const ListItemBase = Tag.slice("li", "", "\n");
    return class extends ListItemBase {
      decorate(text) {
        const enclosingLists = this.element.filterParentNames("ul");
        // NOTE(review): the indent unit is reproduced exactly as it appears in
        // this view; confirm it was not collapsed from a wider indent.
        const indent = enclosingLists.slice(1).map(() => " ").join("");
        return super.decorate(`${indent}* ${trimLeft(text)}`);
      }
    };
  }
}
// Registry of tag converter classes. Lookup takes the first entry whose
// instance name matches, so order matters: "dt" and "dd" appear in both the
// block and slice lists and resolve to their block converter.
const tags = [].concat(
  Tag.blocks().map(name => Tag.block(name)),
  Tag.headings().map((name, index) => Tag.heading(name, index + 1)),
  Tag.slices().map(name => Tag.slice(name, "", "\n")),
  Tag.emphases().map(([name, marker]) => Tag.emphasis(name, marker)),
  [Tag.cell("td"), Tag.cell("th")],
  [Tag.replace("br", "\n"), Tag.replace("hr", "\n---\n"), Tag.replace("head", "")],
  [Tag.li(), Tag.link(), Tag.image()]
  // TO-DO CREATE: code, tbody, ins, del, blockquote, small, large
  // UPDATE: ol, pre, thead, th, td
);
/**
 * Wraps one node of the parsed HTML tree and converts it to markdown.
 *
 * Text nodes are emitted with their whitespace normalised; tag nodes are
 * delegated to the matching converter from the `tags` registry (falling back
 * to the plain `Tag`). Any other node type contributes nothing.
 */
class Element {
  /**
   * @param {Object} element     raw parser node; its `name`, `type`, `data`,
   *                             `children` and `attributes` are copied
   * @param {Element} [parent]   wrapper of the enclosing node; falsy for
   *                             root-level nodes
   * @param {Object} [previous]  raw sibling node right before this one, or null
   * @param {Object} [next]      raw sibling node right after this one, or null
   */
  constructor(element, parent, previous, next) {
    this.name = element.name;
    this.type = element.type;
    this.data = element.data;
    this.children = element.children;
    this.attributes = element.attributes;

    // Always an array, so ancestor lookups also work on root-level nodes
    // (previously left undefined, which made filterParentNames throw).
    this.parentNames = [];
    if (parent) {
      this.parent = parent;
      this.parentNames = (parent.parentNames || []).slice();
      this.parentNames.push(parent.name);
    }

    this.previous = previous;
    this.next = next;
  }

  /**
   * Instantiates the converter for this element's tag name and attaches the
   * element to it.
   *
   * @returns {Tag} first registered converter whose name matches, else a bare Tag
   */
  tag() {
    // `find` stops at the first match instead of instantiating every class.
    const TagClass = tags.find(t => new t().name === this.name) || Tag;
    const tag = new TagClass();
    tag.element = this;
    return tag;
  }

  /** @returns {string} markdown of all child nodes, concatenated */
  innerMarkdown() {
    return Element.parseChildren(this);
  }

  /** Truthy when the previous sibling is a tag listed in `Tag.trimmable()`. */
  leftTrimmable() {
    return this.previous && Tag.trimmable().includes(this.previous.name);
  }

  /** Truthy when the next sibling is a tag listed in `Tag.trimmable()`. */
  rightTrimmable() {
    return this.next && Tag.trimmable().includes(this.next.name);
  }

  /**
   * Text content of this node: trimmed on the sides that touch a trimmable
   * sibling, with every run of spaces/tabs collapsed to a single space.
   *
   * @returns {string}
   */
  text() {
    let text = this.data || "";

    if (this.leftTrimmable()) {
      text = trimLeft(text);
    }

    if (this.rightTrimmable()) {
      text = trimRight(text);
    }

    return text.replace(/[ \t]+/g, " ");
  }

  /**
   * @returns {string} markdown for this node; an empty string for node types
   *                   other than "text" and "tag"
   */
  toMarkdown() {
    switch (this.type) {
      case "text":
        return this.text();
      case "tag":
        return this.tag().toMarkdown();
      default:
        // Unsupported node types contribute nothing; returning "" makes that
        // explicit instead of leaking `undefined` to callers.
        return "";
    }
  }

  /**
   * @param {string} name ancestor tag name to look for
   * @returns {string[]} one entry per ancestor with that name (outermost first)
   */
  filterParentNames(name) {
    return this.parentNames.filter(p => p === name);
  }

  /** Wraps a raw node and converts it in one step. */
  static toMarkdown(element, parent, prev, next) {
    return new Element(element, parent, prev, next).toMarkdown();
  }

  /** Converts the children of an already wrapped element. */
  static parseChildren(parent) {
    return Element.parse(parent.children, parent);
  }

  /**
   * Converts a list of sibling nodes and concatenates the results.
   *
   * @param {Object[]} elements      raw sibling nodes; falsy yields ""
   * @param {Element} [parent=null]  wrapper of the node that owns `elements`
   * @returns {string}
   */
  static parse(elements, parent = null) {
    if (!elements) {
      return "";
    }

    const lastIndex = elements.length - 1;
    const result = [];

    for (let i = 0; i < elements.length; i++) {
      const prev = (i === 0) ? null : elements[i - 1];
      // The last node has no successor: compare against the last index, not
      // the length, so `next` is null rather than an out-of-range undefined.
      const next = (i === lastIndex) ? null : elements[i + 1];
      result.push(Element.toMarkdown(elements[i], parent, prev, next));
    }

    return result.join("");
  }
}
/**
 * Converts an HTML string to markdown.
 *
 * The HTML is parsed, converted node by node, and the result is cleaned up:
 * an outer `<b>`…`</b>` wrapper is removed, carriage returns are dropped and
 * runs of blank lines are collapsed to a single blank line.
 *
 * @param {string} html markup to convert
 * @returns {string} the markdown, or an empty string if anything throws
 *                   during parsing or conversion
 */
export default function toMarkdown(html) {
  try {
    const nodes = parseHTML(html);
    let result = Element.parse(nodes).trim();

    // fix for google doc copy paste
    result = result.replace(/^<b>/, "");
    result = result.replace(/<\/b>$/, "");
    result = result.trim();

    result = result.replace(/\r/g, "");
    result = result.replace(/\n \n/g, "\n\n");
    result = result.replace(/\n{3,}/g, "\n\n");
    return result;
  } catch(err) {
    return "";
  }
}

View File

@ -37,3 +37,4 @@
//= require virtual-dom
//= require virtual-dom-amd
//= require highlight.js
//= require htmlparser.js

View File

@ -2,7 +2,7 @@ import createStore from 'helpers/create-store';
QUnit.module("lib:category-link");
import parseHTML from 'helpers/parse-html';
import parseHTML from 'discourse/helpers/parse-html';
import { categoryBadgeHTML } from "discourse/helpers/category-link";
QUnit.test("categoryBadge without a category", assert => {
@ -44,4 +44,4 @@ QUnit.test("allowUncategorized", assert => {
assert.blank(categoryBadgeHTML(uncategorized), "it doesn't return HTML for uncategorized by default");
assert.present(categoryBadgeHTML(uncategorized, {allowUncategorized: true}), "it returns HTML");
});
});

View File

@ -0,0 +1,126 @@
import toMarkdown from 'discourse/lib/to-markdown';
// Test module for the default export of discourse/lib/to-markdown.
QUnit.module("lib:to-markdown");
// Inline <s>, <b> and <i> inside a sentence become ~~, ** and _ markers;
// nested <b><i> yields the combined **_…_** form and the surrounding words
// are left untouched.
QUnit.test("converts styles between normal words", assert => {
const html = `Line with <s>styles</s> <b><i>between</i></b> words.`;
const markdown = `Line with ~~styles~~ **_between_** words.`;
assert.equal(toMarkdown(html), markdown);
});
// Emphasis tags nested inside another emphasis tag.
QUnit.test("converts inline nested styles", assert => {
// Single-line content: every tag maps to its markdown marker.
let html = `<em>Italicised line with <strong>some random</strong> <b>bold</b> words.</em>`;
let markdown = `_Italicised line with **some random** **bold** words._`;
assert.equal(toMarkdown(html), markdown);
// Content containing line breaks: the expected output keeps <i>/<b> as
// literal HTML tags (with their attributes dropped), while the single-line
// <s> is still converted to ~~.
html = `<i class="fa">Italicised line
with <b title="strong">some
random</b> <s>bold</s> words.</i>`;
markdown = `<i>Italicised line\n with <b>some\n random</b> ~~bold~~ words.</i>`;
assert.equal(toMarkdown(html), markdown);
});
// An anchor whose text differs from its href becomes [text](href).
QUnit.test("converts a link", assert => {
const html = `<a href="https://discourse.org">Discourse</a>`;
const markdown = `[Discourse](https://discourse.org)`;
assert.equal(toMarkdown(html), markdown);
});
// An anchor whose text is identical to its href is emitted as the bare URL.
QUnit.test("put raw URL instead of converting the link", assert => {
let url = "https://discourse.org";
// `html` is a function so the markup is rebuilt after `url` is reassigned.
const html = () => `<a href="${url}">${url}</a>`;
assert.equal(toMarkdown(html()), url);
// Same expectation for a value without a scheme.
url = "discourse.org/t/topic-slug/1";
assert.equal(toMarkdown(html()), url);
});
// An anchor with no content produces no output at all, even with an href.
QUnit.test("skip empty link", assert => {
assert.equal(toMarkdown(`<a href="https://example.com"></a>`), "");
});
// <h1>..<h6> become lines prefixed with 1..6 "#" characters, separated by a
// single blank line; whitespace between the tags in the input (including the
// tab before <h3>) does not show up in the output.
QUnit.test("converts heading tags", assert => {
const html = `
<h1>Heading 1</h1>
<h2>Heading 2</h2>
\t <h3>Heading 3</h3>
<h4>Heading 4</h4>
<h5>Heading 5</h5>
<h6>Heading 6</h6>
`;
const markdown = `# Heading 1\n\n## Heading 2\n\n### Heading 3\n\n#### Heading 4\n\n##### Heading 5\n\n###### Heading 6`;
assert.equal(toMarkdown(html), markdown);
});
// Nested unordered lists: every <li> becomes a "* " bullet and each nested
// <ul> is set off from the preceding item by a blank line. Inline emphasis
// inside list items is still converted.
// NOTE(review): despite the test name, the markup below only contains <ul>
// lists; no <ol> is exercised here.
QUnit.test("converts ul and ol list tags", assert => {
const html = `
<ul>
<li>Item 1</li>
<li>
Item 2
<ul>
<li>Sub Item 1</li>
<li>Sub Item 2</li>
<ul><li>Sub <i>Sub</i> Item 1</li><li>Sub <b>Sub</b> Item 2</li></ul>
</ul>
</li>
<li>Item 3</li>
</ul>
`;
const markdown = `* Item 1\n* Item 2\n\n * Sub Item 1\n * Sub Item 2\n\n * Sub _Sub_ Item 1\n * Sub **Sub** Item 2\n\n* Item 3`;
assert.equal(toMarkdown(html), markdown);
});
// Inline tags without a dedicated conversion (<span>, <label> and the
// made-up <nisi>) are dropped while their text is kept; <strike> is still
// converted to ~~ and each <p> becomes its own paragraph.
QUnit.test("stripes unwanted inline tags", assert => {
const html = `
<p>Lorem ipsum <span>dolor sit amet, consectetur</span> <strike>elit.</strike></p>
<p>Ut minim veniam, <label>quis nostrud</label> laboris <nisi> ut aliquip ex ea</nisi> commodo.</p>
`;
const markdown = `Lorem ipsum dolor sit amet, consectetur ~~elit.~~\n\nUt minim veniam, quis nostrud laboris ut aliquip ex ea commodo.`;
assert.equal(toMarkdown(html), markdown);
});
// Tables are flattened to plain text rather than markdown table syntax:
// cells in a row are joined by a space and body rows by a newline, with the
// header row separated from the body by a blank line. The leading <address>
// and <b> check that block and inline content before the table are each
// separated by a blank line.
QUnit.test("converts table as readable", assert => {
const html = `<address>Discourse Avenue</address><b>laboris</b>
<table>
<thead> <tr><th>Heading 1</th><th>Head 2</th></tr> </thead>
<tbody>
<tr><td>Lorem</td><td>ipsum</td></tr>
<tr><td><b>dolor</b></td> <td><i>sit amet</i></td></tr></tbody>
</table>
`;
const markdown = `Discourse Avenue\n\n**laboris**\n\nHeading 1 Head 2\n\nLorem ipsum\n**dolor** _sit amet_`;
assert.equal(toMarkdown(html), markdown);
});
// <img> conversion to ![alt](src) in several contexts.
QUnit.test("converts img tag", assert => {
const url = "https://example.com/image.png";
// No alt attribute: the alt part is left empty.
let html = `<img src="${url}">`;
assert.equal(toMarkdown(html), `![](${url})`);
// Wrapped in tags without a conversion of their own: only the image remains.
html = `<div><span><img src="${url}" alt="description" /></span></div>`;
assert.equal(toMarkdown(html), `![description](${url})`);
// Image as the sole content of a link: the image markdown becomes the link text.
html = `<a href="http://example.com"><img src="${url}" alt="description" /></a>`;
assert.equal(toMarkdown(html), `[![description](${url})](http://example.com)`);
// Text and image together inside a link.
html = `<a href="http://example.com">description <img src="${url}" /></a>`;
assert.equal(toMarkdown(html), `[description ![](${url})](http://example.com)`);
// No src attribute: nothing is emitted.
html = `<img alt="description" />`;
assert.equal(toMarkdown(html), "");
// Anchor without an href: only the image markdown is emitted.
html = `<a><img src="${url}" alt="description" /></a>`;
assert.equal(toMarkdown(html), `![description](${url})`);
});