Plain text: Created a new HTML to plain text converter

Centralises the conversion logic so it is more consistent, and makes it
smarter by not just following the newline format of the input, which
prevents HTML elements (like list elements) being smushed next to each other.
This commit is contained in:
Dan Brown
2026-04-05 00:05:10 +01:00
parent 25790fd024
commit c7d3775bb9
7 changed files with 125 additions and 5 deletions

View File

@@ -9,6 +9,7 @@ use BookStack\Users\Models\HasCreatorAndUpdater;
use BookStack\Users\Models\OwnableInterface;
use BookStack\Util\HtmlContentFilter;
use BookStack\Util\HtmlContentFilterConfig;
use BookStack\Util\HtmlToPlainText;
use Illuminate\Database\Eloquent\Builder;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
@@ -87,6 +88,12 @@ class Comment extends Model implements Loggable, OwnableInterface
return $filter->filterString($this->html ?? '');
}
/**
 * Get the HTML content of this comment converted to plain text.
 * A null html value is treated as an empty string.
 */
public function getPlainText(): string
{
    $sourceHtml = $this->html ?? '';

    return (new HtmlToPlainText())->convert($sourceHtml);
}
public function jointPermissions(): HasMany
{
return $this->hasMany(JointPermission::class, 'entity_id', 'commentable_id')

View File

@@ -24,7 +24,7 @@ class CommentCreationNotification extends BaseActivityNotification
$locale->trans('notifications.detail_page_name') => new EntityLinkMessageLine($page),
$locale->trans('notifications.detail_page_path') => $this->buildPagePathLine($page, $notifiable),
$locale->trans('notifications.detail_commenter') => $this->user->name,
$locale->trans('notifications.detail_comment') => strip_tags($comment->html),
$locale->trans('notifications.detail_comment') => $comment->getPlainText(),
]);
return $this->newMailMessage($locale)

View File

@@ -24,7 +24,7 @@ class CommentMentionNotification extends BaseActivityNotification
$locale->trans('notifications.detail_page_name') => new EntityLinkMessageLine($page),
$locale->trans('notifications.detail_page_path') => $this->buildPagePathLine($page, $notifiable),
$locale->trans('notifications.detail_commenter') => $this->user->name,
$locale->trans('notifications.detail_comment') => strip_tags($comment->html),
$locale->trans('notifications.detail_comment') => $comment->getPlainText(),
]);
return $this->newMailMessage($locale)

View File

@@ -16,6 +16,7 @@ use BookStack\References\ReferenceUpdater;
use BookStack\Sorting\BookSorter;
use BookStack\Uploads\ImageRepo;
use BookStack\Util\HtmlDescriptionFilter;
use BookStack\Util\HtmlToPlainText;
use Illuminate\Http\UploadedFile;
class BaseRepo
@@ -151,9 +152,10 @@ class BaseRepo
}
if (isset($input['description_html'])) {
$plainTextConverter = new HtmlToPlainText();
$entity->descriptionInfo()->set(
HtmlDescriptionFilter::filterFromString($input['description_html']),
html_entity_decode(strip_tags($input['description_html']))
$plainTextConverter->convert($input['description_html']),
);
} else if (isset($input['description'])) {
$entity->descriptionInfo()->set('', $input['description']);

View File

@@ -16,6 +16,7 @@ use BookStack\Users\Models\User;
use BookStack\Util\HtmlContentFilter;
use BookStack\Util\HtmlContentFilterConfig;
use BookStack\Util\HtmlDocument;
use BookStack\Util\HtmlToPlainText;
use BookStack\Util\WebSafeMimeSniffer;
use Closure;
use DOMElement;
@@ -303,8 +304,8 @@ class PageContent
public function toPlainText(): string
{
// Render the content to HTML first; the plain text is derived from that output.
$html = $this->render(true);
// NOTE(review): this return runs before the converter lines below can be reached.
// It looks like the removed side of this diff hunk (tag stripping + entity decoding) - confirm.
return html_entity_decode(strip_tags($html));
// Conversion via the HtmlToPlainText utility, as also used for comments and descriptions.
$converter = new HtmlToPlainText();
return $converter->convert($html);
}
/**

View File

@@ -0,0 +1,47 @@
<?php
namespace BookStack\Util;
class HtmlToPlainText
{
    /**
     * Inline tag types where the content should not be put on a new line.
     * Any other node name is treated as block-level, and is therefore
     * separated from its surroundings by newlines.
     *
     * NOTE(review): 'div' is retained from the original list even though it is
     * block-level in HTML; presumably deliberate - confirm before changing.
     *
     * @var string[]
     */
    protected array $inlineTags = [
        'a', 'b', 'i', 'u', 'strong', 'em', 'small', 'sup', 'sub', 'span', 'div',
        'abbr', 'bdi', 'bdo', 'big', 'cite', 'code', 'data', 'del', 'dfn', 'font', 'img',
        'ins', 'kbd', 'label', 'mark', 'q', 's', 'samp', 'strike', 'time', 'tt', 'var', 'wbr',
    ];

    /**
     * Convert the provided HTML to relatively clean plain text.
     *
     * Runs of newlines are collapsed down to a single newline, and
     * leading/trailing whitespace is trimmed from the overall result.
     */
    public function convert(string $html): string
    {
        $doc = new HtmlDocument($html);
        $text = $this->nodeToText($doc->getBody());

        // Remove repeated newlines.
        // preg_replace() returns null on failure, in which case we keep the text as it was.
        $text = preg_replace('/\n+/', "\n", $text) ?? $text;

        // Remove leading/trailing whitespace
        return trim($text);
    }

    /**
     * Recursively build the text for the given node and its children.
     *
     * Text nodes provide their content directly. Nodes with a name outside of the
     * inline tag list are wrapped in newlines on both sides, so that neither
     * preceding nor following content ends up on the same line as them.
     * Surplus newlines are expected to be cleaned up by the caller.
     */
    protected function nodeToText(\DOMNode $node): string
    {
        if ($node->nodeType === XML_TEXT_NODE) {
            return $node->textContent;
        }

        // Comments & processing instructions are not visible content, and
        // must not introduce a line break in the middle of surrounding text.
        if ($node->nodeType === XML_COMMENT_NODE || $node->nodeType === XML_PI_NODE) {
            return '';
        }

        $boundary = in_array($node->nodeName, $this->inlineTags, true) ? '' : "\n";

        $text = $boundary;
        foreach ($node->childNodes as $childNode) {
            $text .= $this->nodeToText($childNode);
        }

        // Close the block as well as opening it, otherwise inline content
        // following a block element would be joined onto its last line.
        return $text . $boundary;
    }
}

View File

@@ -0,0 +1,63 @@
<?php
namespace Tests\Util;
use BookStack\Util\HtmlToPlainText;
use Tests\TestCase;
/**
 * Tests for the HtmlToPlainText converter, covering block-level separation,
 * entity decoding and the handling of inline formatting elements.
 */
class HtmlToPlainTextTest extends TestCase
{
    public function test_it_converts_html_to_plain_text(): void
    {
        $html = <<<HTML
        <p>This is a test</p>
        <ul>
        <li>Item 1</li>
        <li>Item 2</li>
        </ul>
        <h2>A Header</h2>
        <p>more &lt;&copy;&gt; text <strong>with bold</strong></p>
        HTML;

        $expected = <<<TEXT
        This is a test
        Item 1
        Item 2
        A Header
        more <©> text with bold
        TEXT;

        $this->assertConversion($html, $expected);
    }

    public function test_adjacent_list_items_are_separated_by_newline(): void
    {
        $html = <<<HTML
        <ul><li>Item A</li><li>Item B</li></ul>
        HTML;

        $expected = <<<TEXT
        Item A
        Item B
        TEXT;

        $this->assertConversion($html, $expected);
    }

    public function test_inline_formats_dont_cause_newlines(): void
    {
        $html = <<<HTML
        <p><strong>H</strong><a>e</a><sup>l</sup><span>l</span><em>o</em></p>
        HTML;

        $expected = <<<TEXT
        Hello
        TEXT;

        $this->assertConversion($html, $expected);
    }

    /**
     * Convert the given HTML and assert the result exactly matches the expected text.
     * Both the input and the expectation are trimmed before use.
     *
     * Deliberately not named runTest(), since PHPUnit's own TestCase
     * defines a method of that name.
     */
    protected function assertConversion(string $html, string $expected): void
    {
        $converter = new HtmlToPlainText();
        $result = $converter->convert(trim($html));

        // assertSame avoids the loose comparison which assertEquals performs
        $this->assertSame(trim($expected), $result);
    }
}