HTML Document class
HTML-Dokument Klasse
This class is a wrapper around the PHP DOM/XPath functions. Its purpose is to perform frequently required tasks (such as removing comments, CSS and scripts, or extracting links) with a single method call. Take a look at the defined methods - they do what their names say.
Diese Klasse ist ein Wrapper für die PHP DOM/XPath-Funktionen. Zweck ist, oft benötigte Aufgaben (wie Kommentare/CSS/JS entfernen, Links herausfiltern usw.) in einem Methodenaufruf zu erledigen. Einfach einen Blick auf die Methoden werfen - sie sind nach ihrem Zweck benannt.
Class source code
Klassen-Quelltext
<?php
/**
* HTML Document manipulation and parsing
*
* @package de.atwillys.sw.php.swlib.util
* @author Stefan Wilhelm
* @copyright Stefan Wilhelm, 2010
* @license GPL
* @version 1.0
*/
namespace sw;
class HtmlDocument {

  /**
   * Internal DOM document instance holding the parsed HTML.
   *
   * @var \DOMDocument
   */
  private $dom = null;

  /**
   * XPath evaluator bound to $dom. It is re-created every time $dom is
   * (re)loaded so that it never refers to a stale document.
   *
   * @var \DOMXPath
   */
  private $xpath = null;

  /**
   * Constructor, optionally the HTML content can be set here.
   * An empty (or whitespace-only) text yields an empty document.
   *
   * @param string $htmlText
   */
  public function __construct($htmlText="") {
    // setHtml() copes with empty input itself, so no empty() guard is needed
    // (empty() would also wrongly discard the valid text "0").
    $this->setHtml($htmlText);
  }

  /**
   * Returns the HTML text of the object (same as getHtml()).
   *
   * @return string
   */
  public function __toString() {
    return $this->getHtml();
  }

  /**
   * Sets a new HTML text to work with. Leading/trailing whitespace is
   * stripped before parsing. Empty input resets the object to an empty
   * document instead of being handed to the parser.
   *
   * @param string $html
   * @return \sw\HtmlDocument
   */
  public function setHtml($html) {
    $html = trim((string) $html, "\n\r\t ");
    $this->dom = new \DOMDocument();
    // DOMDocument::loadHTML() rejects an empty string (warning on PHP 7,
    // ValueError on PHP 8), so only parse when there is something to parse.
    if($html !== '') {
      $this->dom->loadHTML($html);
    }
    $this->xpath = new \DOMXPath($this->dom);
    return $this;
  }

  /**
   * Returns the HTML text contained in the object, as serialized by
   * DOMDocument::saveHTML() with surrounding whitespace trimmed.
   * An empty document yields an empty string.
   *
   * @return string
   */
  public function getHtml() {
    return trim((string) $this->dom->saveHTML(), "\n\r\t ");
  }

  /**
   * Collects the links (<a> elements with a non-empty href) of the document.
   *
   * The result maps each href value to the link text, which is built from
   * all text nodes below the <a> element joined by single spaces and
   * trimmed. Links without text map to an empty string. If the same href
   * occurs more than once, the last occurrence wins.
   *
   * @return array
   */
  public function getLinks() {
    $links = array();
    foreach($this->dom->getElementsByTagName('a') as $anchor) {
      // getAttribute() returns '' for a missing attribute as well.
      $href = $anchor->getAttribute('href');
      if($href === '') {
        continue;
      }
      $text = '';
      foreach($this->xpath->query('.//text()', $anchor) as $textNode) {
        $text .= $textNode->textContent . ' ';
      }
      $links[$href] = trim($text);
    }
    return $links;
  }

  /**
   * Removes <script> ... </script> and <noscript>...</noscript> tags
   * including their contents.
   *
   * @return \sw\HtmlDocument
   */
  public function removeScriptBlocks() {
    return $this->removeTagsWithSubnodes(array('script', 'noscript'));
  }

  /**
   * Removes <style> ... </style> and <style ... /> tags including their
   * contents.
   *
   * @return \sw\HtmlDocument
   */
  public function removeStyleBlocks() {
    return $this->removeTagsWithSubnodes(array('style'));
  }

  /**
   * Removes all comment nodes.
   *
   * @return \sw\HtmlDocument
   */
  public function removeComments() {
    return $this->removeByXPathQuery('//comment()');
  }

  /**
   * Replaces non-breaking spaces with regular spaces (" ").
   *
   * Works on the serialized HTML, so both the entity forms (&nbsp;, &#160;,
   * &#xA0;) and the raw UTF-8 encoded character are replaced. The result is
   * parsed again afterwards.
   *
   * @return \sw\HtmlDocument
   */
  public function nbspToSpace() {
    $html = $this->getHtml();
    // Entity spellings as they appear in the serialized markup.
    $html = str_ireplace(array('&nbsp;', '&#160;', '&#xa0;'), ' ', $html);
    // Raw U+00A0 in UTF-8; byte-exact match, hence the case-sensitive variant.
    $html = str_replace("\xC2\xA0", ' ', $html);
    return $this->setHtml($html);
  }

  /**
   * Removes everything that is not in the <body>. If no <html> or <body> tag
   * is present the whole text is interpreted to be the body (simply does
   * nothing).
   *
   * @return \sw\HtmlDocument
   */
  public function bodyOnly() {
    $htmlNodes = $this->dom->getElementsByTagName('html');
    $bodyNodes = $this->dom->getElementsByTagName('body');
    // Use ->length: DOMNodeList is not Countable on every PHP version.
    if($htmlNodes->length == 0 || $bodyNodes->length == 0) {
      return $this;
    }
    $html = $htmlNodes->item(0);
    // Walk backwards because childNodes is live and shrinks on removal.
    for($i = $html->childNodes->length - 1; $i >= 0; $i--) {
      $child = $html->childNodes->item($i);
      if(strtolower($child->nodeName) != "body") {
        $html->removeChild($child);
      }
    }
    return $this;
  }

  /**
   * Removes nodes matching a specified xpath. If the expression cannot be
   * evaluated (DOMXPath::query() returns false) the document is left
   * unchanged.
   *
   * @param string $query
   * @return \sw\HtmlDocument
   */
  public function removeByXPathQuery($query) {
    $matches = $this->xpath->query($query);
    if($matches === false) {
      return $this;
    }
    // Backwards, so that matched descendants are detached before their
    // matched ancestors.
    for($i = $matches->length - 1; $i >= 0; $i--) {
      $this->detachNode($matches->item($i));
    }
    return $this;
  }

  /**
   * Removes all attributes except the specified exceptions (attribute names,
   * compared case-insensitively). Affects all elements that have attributes.
   *
   * @param array $exceptions Names of the attributes to keep
   * @return \sw\HtmlDocument
   */
  public function removeAttributes(array $exceptions=array('id', 'name', 'class', 'href', 'rel', 'src', 'colspan', 'rowspan')) {
    // Lower-cased lookup table of the attribute names to keep.
    $keep = array();
    foreach($exceptions as $name) {
      $keep[strtolower((string) $name)] = true;
    }
    // Only elements carrying at least one attribute are of interest.
    foreach($this->xpath->query('//*[@*]') as $element) {
      // Collect first, remove afterwards: the attribute map is live and must
      // not be modified while it is being iterated.
      $obsolete = array();
      foreach($element->attributes as $attribute) {
        if(!isset($keep[strtolower($attribute->name)])) {
          $obsolete[] = $attribute;
        }
      }
      foreach($obsolete as $attribute) {
        try {
          $element->removeAttributeNode($attribute);
        } catch(\DOMException $e) {
          // Best effort: an attribute the DOM refuses to remove (e.g. on a
          // read-only node) is skipped rather than aborting the clean-up.
        }
      }
    }
    return $this;
  }

  /**
   * Removes tags without removing the contents or subnodes.
   *
   * Operates on the serialized HTML with a regular expression and parses
   * the result again. Opening tags (with or without attributes), closing
   * tags and self-closing tags are removed, together with any whitespace
   * directly preceding the tag. Non-word characters are stripped from the
   * given tag names; names that end up empty are ignored.
   *
   * Limitation: an attribute value containing ">" ends the match early.
   *
   * @param array $tags Tag names, e.g. array('span', 'font')
   * @return \sw\HtmlDocument
   */
  public function removeTags(array $tags=array()) {
    $html = $this->getHtml();
    foreach($tags as $tag) {
      $tag = preg_replace('/\W/', '', (string) $tag);
      if($tag === '') {
        // An empty name would turn the pattern into a matcher for "<>".
        continue;
      }
      // "(?:\s[^>]*)?" lets the opening tag carry attributes. Requiring the
      // whitespace keeps e.g. "b" from matching "<br>" or "<body>".
      $html = preg_replace('/\s*<\s*\/?\s*' . $tag . '(?:\s[^>]*)?\/?\s*>/i', '', $html);
    }
    return $this->setHtml($html);
  }

  /**
   * Removes tags by name including all subnodes.
   *
   * @param array $tags Tag names, e.g. array('script', 'style')
   * @return \sw\HtmlDocument
   */
  public function removeTagsWithSubnodes(array $tags=array()) {
    foreach($tags as $tag) {
      // getElementsByTagName() returns a live list; take a snapshot before
      // removing anything.
      $snapshot = array();
      foreach($this->dom->getElementsByTagName($tag) as $node) {
        $snapshot[] = $node;
      }
      // Last to first, so nested matches go before their matched ancestors.
      while(count($snapshot) > 0) {
        $this->detachNode(array_pop($snapshot));
      }
    }
    return $this;
  }

  /**
   * Detaches a node (and with it its subtree) from its parent. Nodes that
   * have no parent are left alone.
   *
   * @param \DOMNode $node
   * @return void
   */
  private function detachNode($node) {
    if(!$node || !$node->parentNode) {
      return;
    }
    try {
      $node->parentNode->removeChild($node);
    } catch(\DOMException $e) {
      // Best effort: a node the DOM refuses to remove is skipped.
    }
  }
}