Files
BookStack/app/Util/HtmlToPlainText.php
Dan Brown c7d3775bb9 Plain text: Created a new HTML to plain text converter
To centralise logic to be more consistent, and to have smarter logic
which avoids just following newline format from input, preventing
smushing HTML elements (like list elements) next to each other
2026-04-05 00:05:10 +01:00

48 lines
1.1 KiB
PHP

<?php
namespace BookStack\Util;
class HtmlToPlainText
{
    /**
     * Inline tags types where the content should not be put on a new line.
     */
    protected array $inlineTags = [
        'a', 'b', 'i', 'u', 'strong', 'em', 'small', 'sup', 'sub', 'span', 'div',
    ];

    /**
     * Convert the provided HTML to relatively clean plain text.
     *
     * The HTML is parsed into a document, the body is walked to build up
     * the text, runs of newlines are collapsed into one, and the result is
     * trimmed of leading/trailing whitespace.
     */
    public function convert(string $html): string
    {
        $doc = new HtmlDocument($html);
        $text = $this->nodeToText($doc->getBody());

        // Remove repeated newlines.
        // preg_replace() returns null on a PCRE error; keep the un-collapsed
        // text in that case rather than passing null on to trim().
        $text = preg_replace('/\n+/', "\n", $text) ?? $text;

        // Remove leading/trailing whitespace
        return trim($text);
    }

    /**
     * Recursively build the plain text for the given node.
     *
     * Text nodes provide their content as-is. Comments and processing
     * instructions provide nothing. Any other node whose name is not listed
     * in $inlineTags is prefixed with a newline, followed by the text of
     * each of its children in document order.
     */
    protected function nodeToText(\DOMNode $node): string
    {
        if ($node->nodeType === XML_TEXT_NODE) {
            return $node->textContent;
        }

        // Comments and processing instructions are not visible content, and
        // their node names ("#comment" etc.) are never in the inline list, so
        // without this guard they would inject a line break mid-sentence.
        if ($node->nodeType === XML_COMMENT_NODE || $node->nodeType === XML_PI_NODE) {
            return '';
        }

        $text = '';
        if (!in_array($node->nodeName, $this->inlineTags, true)) {
            $text .= "\n";
        }

        foreach ($node->childNodes as $childNode) {
            $text .= $this->nodeToText($childNode);
        }

        return $text;
    }
}