Revamped the Html Truncator code to address issues with invalid HTML #1019

This commit is contained in:
Andy Miller 2016-09-06 15:38:36 -06:00
parent 10825d3f70
commit a54f30b8ae
6 changed files with 229 additions and 164 deletions

View File

@ -28,7 +28,8 @@
"ext-openssl": "*",
"ext-curl": "*",
"ext-zip": "*",
"league/climate": "^3.2"
"league/climate": "^3.2",
"antoligy/dom-string-iterators": "^1.0"
},
"require-dev": {
"codeception/codeception": "^2.1",

48
composer.lock generated
View File

@ -4,9 +4,53 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"This file is @generated automatically"
],
"hash": "25e59d23a9af7f43dd9cd9d462057abd",
"content-hash": "22973a67f2eae64610e739fa82a3d60b",
"hash": "7a8caecbaedbf785d96b7437f296ca66",
"content-hash": "2fec25b3b5d627c0896d5ee3030b6bed",
"packages": [
{
"name": "antoligy/dom-string-iterators",
"version": "v1.0.0",
"source": {
"type": "git",
"url": "https://github.com/antoligy/dom-string-iterators.git",
"reference": "9a624b082493fee9b972840dbd677494edb94cf7"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/antoligy/dom-string-iterators/zipball/9a624b082493fee9b972840dbd677494edb94cf7",
"reference": "9a624b082493fee9b972840dbd677494edb94cf7",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"type": "library",
"autoload": {
"psr-4": {
"": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"Public Domain"
],
"authors": [
{
"name": "Alex Wilson",
"email": "a@ax.gy"
},
{
"name": "Kornel Lesinski",
"email": "pornel@pornel.net"
},
{
"name": "Patrick Galbraith",
"email": "patrick.j.galbraith@gmail.com"
}
],
"description": "Composer package for DOMWordsIterator and DOMLettersIterator",
"time": "2015-11-04 17:33:14"
},
{
"name": "doctrine/cache",
"version": "v1.6.0",

View File

@ -8,12 +8,16 @@
namespace Grav\Common\Helpers;
use DOMText;
use DOMDocument;
use DOMWordsIterator;
use DOMLettersIterator;
/**
* This file is part of urodoz/truncateHTML.
* This file is part of https://github.com/Bluetel-Solutions/twig-truncate-extension
*
* (c) Albert Lacarta <urodoz@gmail.com>
* Copyright (c) 2015 Bluetel Solutions developers@bluetel.co.uk
* Copyright (c) 2015 Alex Wilson ajw@bluetel.co.uk
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
@ -21,181 +25,188 @@ use DOMDocument;
class Truncator {
public static $default_options = array(
'ellipsis' => '…',
'break' => ' ',
'length_in_chars' => false,
'word_safe' => false,
);
/**
* Safely truncates HTML by a given number of words.
* @param string $html Input HTML.
* @param integer $limit Limit to how many words we preserve.
* @param string $ellipsis String to use as ellipsis (if any).
* @return string Safe truncated HTML.
*/
public static function truncateWords($html, $limit = 0, $ellipsis = "")
{
if ($limit <= 0) {
return $html;
}
// These tags are allowed to have an ellipsis inside
public static $ellipsable_tags = array(
'p', 'ol', 'ul', 'li',
'div', 'header', 'article', 'nav',
'section', 'footer', 'aside',
'dd', 'dt', 'dl',
);
$dom = self::htmlToDomDocument($html);
public static $self_closing_tags = array(
'br', 'hr', 'img',
);
// Grab the body of our DOM.
$body = $dom->getElementsByTagName("body")->item(0);
// Iterate over words.
$words = new DOMWordsIterator($body);
foreach ($words as $word) {
// If we have exceeded the limit, we delete the remainder of the content.
if ($words->key() >= $limit) {
// Grab current position.
$currentWordPosition = $words->currentWordPosition();
$curNode = $currentWordPosition[0];
$offset = $currentWordPosition[1];
$words = $currentWordPosition[2];
$curNode->nodeValue = substr(
$curNode->nodeValue,
0,
$words[$offset][1] + strlen($words[$offset][0])
);
self::removeProceedingNodes($curNode, $body);
if (!empty($ellipsis)) {
self::insertEllipsis($curNode, $ellipsis);
}
break;
}
}
return self::innerHTML($body);
}
/**
* Truncate given HTML string to specified length.
* If length_in_chars is false it's trimmed by number
* of words, otherwise by number of characters.
*
* @param string $html
* @param integer $length
* @param string|array $opts
* @return string
* Safely truncates HTML by a given number of letters.
* @param string $html Input HTML.
* @param integer $limit Limit to how many letters we preserve.
* @param string $ellipsis String to use as ellipsis (if any).
* @return string Safe truncated HTML.
*/
public static function truncate($html, $length, $opts=array())
public static function truncateLetters($html, $limit = 0, $ellipsis = "")
{
if (is_string($opts)) $opts = array('ellipsis' => $opts);
$opts = array_merge(static::$default_options, $opts);
// wrap the html in case it consists of adjacent nodes like <p>foo</p><p>bar</p>
$html = mb_convert_encoding("<div>".$html."</div>", 'HTML-ENTITIES', 'UTF-8');
$root_node = null;
// Parse using HTML5Lib if it's available.
if (class_exists('HTML5Lib\\Parser')) {
try {
$doc = \HTML5Lib\Parser::parse($html);
$root_node = $doc->documentElement->lastChild->lastChild;
}
catch (\Exception $e) {
;
}
if ($limit <= 0) {
return $html;
}
if ($root_node === null) {
// HTML5Lib not available so we'll have to use DOMDocument
// We'll only be able to parse HTML5 if it's valid XML
$doc = new DOMDocument('4.01', 'utf-8');
$doc->formatOutput = false;
$doc->preserveWhiteSpace = true;
// loadHTML will fail with HTML5 tags (article, nav, etc)
// so we need to suppress errors and if it fails to parse we
// retry with the XML parser instead
$prev_use_errors = libxml_use_internal_errors(true);
if ($doc->loadHTML($html)) {
$root_node = $doc->documentElement->lastChild->lastChild;
}
else if ($doc->loadXML($html)) {
$root_node = $doc->documentElement;
}
else {
libxml_use_internal_errors($prev_use_errors);
throw new \RuntimeException;
}
libxml_use_internal_errors($prev_use_errors);
}
list($text, $_, $opts) = static::truncateNode($doc, $root_node, $length, $opts);
$text = mb_substr(mb_substr($text, 0, -6), 5);
$dom = self::htmlToDomDocument($html);
return $text;
}
// Grab the body of our DOM.
$body = $dom->getElementsByTagName("body")->item(0);
protected static function truncateNode($doc, $node, $length, $opts)
{
if ($length === 0 && !static::ellipsable($node)) {
return array('', 1, $opts);
}
list($inner, $remaining, $opts) = static::innerTruncate($doc, $node, $length, $opts);
if (0 === mb_strlen($inner)) {
return array(in_array(mb_strtolower($node->nodeName), static::$self_closing_tags) ? $doc->saveXML($node) : "", $length - $remaining, $opts);
}
while($node->firstChild) {
$node->removeChild($node->firstChild);
}
$newNode = $doc->createDocumentFragment();
// handle the ampersand
$newNode->appendXml(static::xmlEscape($inner));
$node->appendChild($newNode);
return array($doc->saveXML($node), $length - $remaining, $opts);
}
// Iterate over letters.
$letters = new DOMLettersIterator($body);
foreach ($letters as $letter) {
protected static function innerTruncate($doc, $node, $length, $opts)
{
$inner = '';
$remaining = $length;
foreach($node->childNodes as $childNode) {
if ($childNode->nodeType === XML_ELEMENT_NODE) {
list($txt, $nb, $opts) = static::truncateNode($doc, $childNode, $remaining, $opts);
}
else if ($childNode->nodeType === XML_TEXT_NODE) {
list($txt, $nb, $opts) = static::truncateText($childNode, $remaining, $opts);
} else {
$txt = '';
$nb = 0;
}
// If we have exceeded the limit, we want to delete the remainder of this document.
if ($letters->key() >= $limit) {
// unhandle the ampersand
$txt = static::xmlUnescape($txt);
$currentText = $letters->currentTextPosition();
$currentText[0]->nodeValue = substr($currentText[0]->nodeValue, 0, $currentText[1] + 1);
self::removeProceedingNodes($currentText[0], $body);
$remaining -= $nb;
$inner .= $txt;
if ($remaining < 0) {
if (static::ellipsable($node)) {
$inner = preg_replace('/(?:[\s\pP]+|(?:&(?:[a-z]+|#[0-9]+);?))*$/u', '', $inner).$opts['ellipsis'];
$opts['ellipsis'] = '';
$opts['was_truncated'] = true;
if (!empty($ellipsis)) {
self::insertEllipsis($currentText[0], $ellipsis);
}
break;
}
}
return array($inner, $remaining, $opts);
return self::innerHTML($body);
}
protected static function truncateText($node, $length, $opts)
/**
* Builds a DOMDocument object from a string containing HTML.
* @param string $html HTML to load.
* @return DOMDocument Returns a DOMDocument object.
*/
public static function htmlToDomDocument($html)
{
$string = $node->textContent;
// Transform multibyte entities which otherwise display incorrectly.
$html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
if ($opts['length_in_chars']) {
$count = mb_strlen($string);
if ($count <= $length && $length > 0) {
return array($string, $count, $opts);
}
if ($opts['word_safe']) {
if (false !== ($breakpoint = mb_strpos($string, $opts['break'], $length))) {
if ($breakpoint < mb_strlen($string) - 1) {
$string = mb_substr($string, 0, $breakpoint) . $opts['break'];
}
// Internal errors enabled as HTML5 not fully supported.
libxml_use_internal_errors(true);
// Instantiate new DOMDocument object, and then load in UTF-8 HTML.
$dom = new DOMDocument();
$dom->encoding = 'UTF-8';
$dom->loadHTML($html);
return $dom;
}
/**
* Removes all nodes after the current node.
* @param DOMNode|DOMElement $domNode
* @param DOMNode|DOMElement $topNode
* @return void
*/
private static function removeProceedingNodes($domNode, $topNode)
{
$nextNode = $domNode->nextSibling;
if ($nextNode !== null) {
self::removeProceedingNodes($nextNode, $topNode);
$domNode->parentNode->removeChild($nextNode);
} else {
//scan upwards till we find a sibling
$curNode = $domNode->parentNode;
while ($curNode !== $topNode) {
if ($curNode->nextSibling !== null) {
$curNode = $curNode->nextSibling;
self::removeProceedingNodes($curNode, $topNode);
$curNode->parentNode->removeChild($curNode);
break;
}
return array($string, $count, $opts);
$curNode = $curNode->parentNode;
}
return array(mb_substr($node->textContent, 0, $length), $count, $opts);
}
else {
preg_match_all('/\s*\S+/', $string, $words);
$words = $words[0];
$count = count($words);
if ($count <= $length && $length > 0) {
return array($string, $count, $opts);
}
return array(implode('', array_slice($words, 0, $length)), $count, $opts);
}
}
protected static function ellipsable($node)
/**
* Inserts an ellipsis
* @param DOMNode|DOMElement $domNode Element to insert after.
* @param string $ellipsis Text used to suffix our document.
* @return void
*/
private static function insertEllipsis($domNode, $ellipsis)
{
return ($node instanceof DOMDocument)
|| in_array(mb_strtolower($node->nodeName), static::$ellipsable_tags)
;
$avoid = array('a', 'strong', 'em', 'h1', 'h2', 'h3', 'h4', 'h5'); //html tags to avoid appending the ellipsis to
if (in_array($domNode->parentNode->nodeName, $avoid) && $domNode->parentNode->parentNode !== null) {
// Append as text node to parent instead
$textNode = new DOMText($ellipsis);
if ($domNode->parentNode->parentNode->nextSibling) {
$domNode->parentNode->parentNode->insertBefore($textNode, $domNode->parentNode->parentNode->nextSibling);
} else {
$domNode->parentNode->parentNode->appendChild($textNode);
}
} else {
// Append to current node
$domNode->nodeValue = rtrim($domNode->nodeValue) . $ellipsis;
}
}
protected static function xmlEscape($string)
{
$string = str_replace('&', '&amp;', $string);
$string = str_replace('<?', '&lt;?', $string);
return $string;
/**
 * Returns the inner HTML of a DOM node, i.e. the serialized markup of all of
 * its child nodes without the node's own opening and closing tags.
 *
 * All children are copied into a single scratch document and serialized in
 * one pass. Serializing and trimming each child separately would discard the
 * whitespace that separates sibling nodes (for example the space in
 * "<strong>a</strong> b"), so only the combined result is trimmed, which also
 * removes any trailing newline emitted by saveHTML().
 *
 * @param \DOMNode $element Node whose children are serialized.
 * @return string Serialized child markup, or an empty string when the node has no children.
 */
private static function innerHTML($element) {
    // Nothing to serialize; avoid dumping an empty scratch document.
    if (!$element->hasChildNodes()) {
        return "";
    }

    $tmp_dom = new DOMDocument();
    foreach ($element->childNodes as $child) {
        // importNode() makes a deep copy, so the source document is left untouched.
        $tmp_dom->appendChild($tmp_dom->importNode($child, true));
    }

    // Trim once, at the outer edges only, so whitespace between siblings survives.
    return trim($tmp_dom->saveHTML());
}
protected static function xmlUnescape($string)
{
$string = str_replace('&amp;', '&', $string);
$string = str_replace('&lt;?', '<?', $string);
return $string;
}
}

View File

@ -502,7 +502,8 @@ class Page
$size = 300;
}
return html_entity_decode(Utils::truncateHTML($content, $size));
$summary = Utils::truncateHTML($content, $size);
return html_entity_decode($summary);
}
/**

View File

@ -183,26 +183,28 @@ abstract class Utils
* Truncate HTML by number of characters. Not "word-safe"!
*
* @param string $text
* @param int $length
* @param int $length in characters
* @param string $ellipsis
*
* @return string
*/
public static function truncateHtml($text, $length = 100)
public static function truncateHtml($text, $length = 100, $ellipsis = '...')
{
return Truncator::truncate($text, $length, ['length_in_chars' => true]);
return Truncator::truncateLetters($text, $length, $ellipsis);
}
/**
* Truncate HTML by number of words in a "word-safe" manner.
*
* @param string $text
* @param int $length
* @param int $length in words
* @param string $ellipsis
*
* @return string
*/
public static function safeTruncateHtml($text, $length = 100)
public static function safeTruncateHtml($text, $length = 25, $ellipsis = '...')
{
return Truncator::truncate($text, $length, ['length_in_chars' => true, 'word_safe' => true]);
return Truncator::truncateWords($text, $length, $ellipsis);
}
/**

View File

@ -123,16 +123,22 @@ class UtilsTest extends \Codeception\TestCase\Test
public function testTruncateHtml()
{
$this->assertEquals('<p>T…</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 1));
$this->assertEquals('<p>This…</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 4));
$this->assertEquals('', Utils::truncateHtml('<input type="file" id="file" multiple />', 6, true));
$this->assertEquals('<p>T...</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 1));
$this->assertEquals('<p>This...</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 4));
$this->assertEquals('<p>This is a...</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 10));
$this->assertEquals('<p>This is a string to truncate</p>', Utils::truncateHtml('<p>This is a string to truncate</p>', 100));
$this->assertEquals('<input type="file" id="file" multiple>', Utils::truncateHtml('<input type="file" id="file" multiple />', 6));
$this->assertEquals('<ol><li>item 1 <i>so...</i></li></ol>', Utils::truncateHtml('<ol><li>item 1 <i>something</i></li><li>item 2 <strong>bold</strong></li></ol>', 10));
}
public function testSafeTruncateHtml()
{
$this->assertEquals('<p>This…</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 1));
$this->assertEquals('<p>This…</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 4));
$this->assertEquals('<p>This...</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 1));
$this->assertEquals('<p>This is...</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 2));
$this->assertEquals('<p>This is a string to...</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 5));
$this->assertEquals('<p>This is a string to truncate</p>', Utils::safeTruncateHtml('<p>This is a string to truncate</p>', 20));
$this->assertEquals('<input type="file" id="file" multiple>', Utils::safeTruncateHtml('<input type="file" id="file" multiple />', 6));
$this->assertEquals('<ol><li>item 1 <i>something</i></li><li>item 2...</li></ol>', Utils::safeTruncateHtml('<ol><li>item 1 <i>something</i></li><li>item 2 <strong>bold</strong></li></ol>', 5));
}
public function testGenerateRandomString()