FIX: Improve HTML to Markdown conversion (#9231)

This commit ensures that whitespace is preserved inside <pre> blocks, but
stripped inside text paragraphs.
This commit is contained in:
Dan Ungureanu 2020-03-18 19:31:10 +02:00 committed by GitHub
parent 778454e26b
commit 1393950dbc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 16 additions and 3 deletions

View File

@@ -31,7 +31,7 @@ class HtmlToMarkdown
def remove_whitespaces!
@doc.traverse do |node|
if node.is_a? Nokogiri::XML::Text
if node.is_a?(Nokogiri::XML::Text) && node.parent.name != "pre"
node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element&.description&.block?
node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element.nil? && node.parent.description&.block?
node.content = node.content.gsub(/[[:space:]]+\z/, "") if node.next_element&.description&.block?
@@ -220,10 +220,21 @@ class HtmlToMarkdown
end
def visit_text(node)
top_block = @stack[-1]
if top_block.name == "pre"
top_block.markdown << node.text
return
end
node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element.nil? && EMPHASIS.include?(node.parent.name)
indent = node.text[/^\s+/] || ""
if top_block.markdown.present? && indent = node.text[/^\s+/]
top_block.markdown << indent
end
text = node.text.gsub(/^\s+/, "").gsub(/\s{2,}/, " ")
@stack[-1].markdown << [indent, text].join("")
top_block.markdown << text
end
def format_block

View File

@@ -204,6 +204,7 @@ describe HtmlToMarkdown do
expect(html_to_markdown("<pre>var foo = 'bar';</pre>")).to eq("```\nvar foo = 'bar';\n```")
expect(html_to_markdown("<pre><code>var foo = 'bar';</code></pre>")).to eq("```\nvar foo = 'bar';\n```")
expect(html_to_markdown(%Q{<pre><code class="lang-javascript">var foo = 'bar';</code></pre>})).to eq("```javascript\nvar foo = 'bar';\n```")
expect(html_to_markdown("<pre> function f() {\n console.log('Hello world!');\n }</pre>")).to eq("```\n function f() {\n console.log('Hello world!');\n }\n```")
end
it "supports <pre> inside <blockquote>" do
@@ -220,6 +221,7 @@ describe HtmlToMarkdown do
it "handles <p>" do
expect(html_to_markdown("<p>1st paragraph</p><p>2nd paragraph</p>")).to eq("1st paragraph\n\n2nd paragraph")
expect(html_to_markdown("<body><p>1st paragraph</p>\n <p> 2nd paragraph\n 2nd paragraph</p>\n<p>3rd paragraph</p></body>")).to eq("1st paragraph\n\n2nd paragraph\n2nd paragraph\n\n3rd paragraph")
end
it "handles <div>" do