Current Path : /var/www/www-root/data/www/monolith-realty.ru/bitrix/modules/main/lib/urlpreview/ |
Current File : /var/www/www-root/data/www/monolith-realty.ru/bitrix/modules/main/lib/urlpreview/htmldocument.php |
<?php namespace Bitrix\Main\UrlPreview; use Bitrix\Main\Context; use Bitrix\Main\Text\Encoding; use Bitrix\Main\Type\DateTime; use Bitrix\Main\Web\HttpClient; use Bitrix\Main\Web\Uri; use Bitrix\Main\Web\MimeType; class HtmlDocument { const MAX_IMAGES = 4; const MAX_IMAGE_URL_LENGTH = 2000; const MAX_HTML_LENGTH = 1048576; // 1 MB /** @var Uri */ protected $uri; /** @var string */ protected $html; /** @var string */ protected $htmlEncoding; /** @var array * Allowed keys so far: TITLE, DESCRIPTION, IMAGE */ protected $metadata = array( "TITLE" => null, "DESCRIPTION" => null, "IMAGE" => null, "EMBED" => null, "DATE_EXPIRE" => null, ); /** @var array */ protected $metaElements = array(); /** @var array */ protected $linkElements = array(); /** * HtmlDocument constructor. * * @param string $html Document HTML code. * @param Uri $uri Document's URL. */ public function __construct($html, Uri $uri) { $this->html = substr($html, 0, self::MAX_HTML_LENGTH); $this->uri = $uri; } /** * Returns Uri of the document * * @return Uri */ public function getUri() { return $this->uri; } /** * Returns full html code of the document * * @return string */ public function getHtml() { return $this->html; } /** * Returns true if metadata is complete * * @return bool */ public function checkMetadata() { $result = ( $this->metadata['TITLE'] != '' && $this->metadata['DESCRIPTION'] != '' && $this->metadata['IMAGE'] != ''); if ($this->isEmbeddingAllowed()) { $result = $result && $this->metadata['EMBED'] != ''; } return $result; } /** * Returns metadata, extracted from the page. Should return an array with required key TITLE * and optional keys DESCRIPTION and URL * * @return array|false */ public function getMetadata() { return $this->metadata; } /** * Returns document's TITLE metadata * * @return string */ public function getTitle() { return $this->metadata['TITLE']; } /** * Sets document's TITLE metadata * * @param string $title Title. * @return void */ public function setTitle($title) { if ($title <> '') { $this->metadata['TITLE'] = $this->filterString($title); } } /** * @return string */ public function getDescription() { return $this->metadata['DESCRIPTION']; } /** * Sets document's DESCRIPTION metadata * * @param string $description Description. * @return void */ public function setDescription($description) { if ($description <> '') { $this->metadata['DESCRIPTION'] = $this->filterString($description); } } /** * @return string Main image's url. */ public function getImage() { return $this->metadata['IMAGE']; } /** * Sets document's IMAGE metadata * * @param string $image Main image's url. * @return void */ public function setImage($image) { if ($image <> '') { $imageUrl = $this->normalizeImageUrl($image); if (!is_null($imageUrl) && $this->validateImage($imageUrl, true)) { $this->metadata['IMAGE'] = $imageUrl; } } } /** * @return string HTML code to embed url to the page. */ public function getEmdbed() { return $this->metadata['EMBED']; } /** * Sets document's EMBED metadata, if site is allowed to be embedded. * * @param string $embed HTML code for embedding object to the page. * @return void */ public function setEmbed($embed) { if ($this->isEmbeddingAllowed()) { $this->metadata['EMBED'] = $embed; } } /** * Sets additional metadata field. * @param string $fieldName Name of the field. Expected values: * <li>FAVICON: $fieldValue must contain the url of document's favicon * <li>IMAGES: $fieldValue must be the array of urls of images, detected in the document * <li>In other cases, $fieldValue must contain plain text. * @param string $fieldValue Field value. * @return void */ public function setExtraField($fieldName, $fieldValue) { if ($fieldName == 'FAVICON') { $this->metadata['EXTRA'][$fieldName] = $this->convertRelativeUriToAbsolute($fieldValue); } elseif ($fieldName == 'IMAGES') { if (is_array($fieldValue)) { $this->metadata['EXTRA']['IMAGES'] = array(); foreach($fieldValue as $image) { $image = $this->normalizeImageUrl($image); if ($image) { $this->metadata['EXTRA']['IMAGES'][] = $image; } if (count($this->metadata['EXTRA']['IMAGES']) >= self::MAX_IMAGES) { break; } } } } else { $this->metadata['EXTRA'][$fieldName] = $this->filterString($fieldValue); } } /** * Returns value of the additional metadata field * @param string $fieldName Name of the field. * @return string|null Value of the additional metadata field. */ public function getExtraField($fieldName) { return $this->metadata['EXTRA'][$fieldName] ?? null; } /** * Sets Expire date for the metadata. * * @param DateTime $dateExpire */ public function setDateExpire(DateTime $dateExpire) { if (!isset($this->metadata['DATE_EXPIRE']) || $this->metadata['DATE_EXPIRE']->getTimestamp() > $dateExpire->getTimestamp()) { $this->metadata['DATE_EXPIRE'] = $dateExpire; } } /** * Returns expire date for the metadata. * * @return DateTime|null */ public function getDateExpire(): ?DateTime { return $this->metadata['DATE_EXPIRE']; } /** * Set HTML document encoding * * @param string $encoding Document's encoding. * @return void */ public function setEncoding($encoding) { $encoding = trim($encoding, " \t\n\r\0\x0B'\""); $this->htmlEncoding = $encoding; } /** * @return string Document encoding. */ public function getEncoding() { if ($this->htmlEncoding <> '') { return $this->htmlEncoding; } $this->htmlEncoding = $this->detectEncoding(); return $this->htmlEncoding; } /** * Auto-detect and set HTML document encoding * * @return string Detected encoding. */ public function detectEncoding() { $result = ''; if (empty($this->metaElements)) { $this->metaElements = $this->extractElementAttributes('meta'); } foreach($this->metaElements as $metaElement) { if (isset($metaElement['http-equiv']) && mb_strtolower($metaElement['http-equiv']) == 'content-type') { if (preg_match('/charset=([\w\-]+)/', $metaElement['content'], $matches)) { $result = $matches[1]; break; } } elseif (isset($metaElement['charset'])) { $result = $metaElement['charset']; break; } } return $result; } /** * Parses html content for attributes of the specified elements and fills $destination array with found attributes * * @param string $tagName Name of the tag. * @return array */ public function extractElementAttributes($tagName) { $results = array(); preg_match_all("/<$tagName.+?>/mis", $this->html, $elements); foreach($elements[0] as $element) { preg_match_all('/(?:([\w\-_]+)=([\'"])(.*?)\g{-2}\s*)/mis', $element, $matches); $elementAttributes = array(); foreach($matches[1] as $k => $attributeName) { $attributeName = mb_strtolower($attributeName); $attributeValue = $matches[3][$k]; $elementAttributes[$attributeName] = $attributeValue; } $results[] = $elementAttributes; } return $results; } /** * Returns value of the content attribute * * @param string $name Value of a name or property attribute. * @return string * */ public function getMetaContent($name) { if (empty($this->metaElements)) { $this->metaElements = $this->extractElementAttributes('meta'); } $name = mb_strtolower($name); foreach ($this->metaElements as $metaElement) { if ((isset($metaElement['name']) && mb_strtolower($metaElement['name']) === $name || isset($metaElement['property']) && mb_strtolower($metaElement['property']) === $name) && $metaElement['content'] <> '') { return $metaElement['content']; } } return null; } /** * Returns value of the href attribute. * * @param string $rel Value of the rel attribute. * @return string */ public function getLinkHref($rel) { if (empty($this->linkElements)) { $this->linkElements = $this->extractElementAttributes('link'); } $rel = mb_strtolower($rel); foreach ($this->linkElements as $linkElement) { if (isset($linkElement['rel']) && mb_strtolower($linkElement['rel']) == $rel && $linkElement['href'] <> '') { return $linkElement['href']; } } return null; } /** * Sanitizes string and converts it to the site's charset. * * @param string $str Input string. * @return string */ protected function filterString($str) { $str = html_entity_decode($str, ENT_QUOTES, $this->getEncoding()); $str = Encoding::convertEncoding($str, $this->getEncoding(), Context::getCurrent()->getCulture()->getCharset()); $str = trim($str); $str = strip_tags($str); return $str; } /** * Converts relative url to the absolute, considering document's url. * @param string $uri Relative url. * @return null|string Absolute url or null if relative url contains errors. */ protected function convertRelativeUriToAbsolute($uri) { if (strpos($uri, '//') === 0) { $uri = $this->uri->getScheme().":".$uri; } if (preg_match('#^https?://#', $uri)) { return $uri; } $pars = parse_url($uri); if ($pars === false) { return null; } if (isset($pars['host'])) { $result = $uri; } elseif (isset($pars['path'])) { if (mb_substr($pars['path'], 0, 1) !== '/') { $pathPrefix = preg_replace('/^(.+?)([^\/]*)$/', '$1', $this->uri->getPath()); $pars['path'] = $pathPrefix.$pars['path']; } $uriPort = ''; if ($this->uri->getScheme() === 'http' && $this->uri->getPort() != '80' || $this->uri->getScheme() === 'https' && $this->uri->getPort() != '443') { $uriPort = ':'.$this->uri->getPort(); } $result = $this->uri->getScheme().'://' .$this->uri->getHost() .$uriPort .$pars['path'] .(isset($pars['query']) ? '?'.$pars['query'] : '') .(isset($pars['fragment']) ? '#'.$pars['fragment'] : ''); } else { $result = null; } return $result; } /** * Transforms image's URL from relative to absolute and checks length of the resulting URL. * @param string $url Image's URL. * @return string|null Absolute image's URL, or null if URL is incorrect or too long. */ protected function normalizeImageUrl($url): ?string { $url = $this->convertRelativeUriToAbsolute($url); if (mb_strlen($url) > self::MAX_IMAGE_URL_LENGTH) { $url = null; } return $url; } /** * Validates mime-type of the image * @param string $url Absolute image's URL. * @return bool */ protected function validateImage($url, $skipForPrivateIp = false) { $httpClient = new HttpClient(); $httpClient->setTimeout(5); $httpClient->setStreamTimeout(5); $httpClient->setPrivateIp(false); $httpClient->setHeader('User-Agent', UrlPreview::USER_AGENT); if (!$httpClient->query('HEAD', $url)) { $errorCode = array_key_first($httpClient->getError()); return ($skipForPrivateIp && $errorCode === 'PRIVATE_IP'); } if ($httpClient->getStatus() !== 200) { return false; } $contentType = $httpClient->getHeaders()->getContentType(); return MimeType::isImage($contentType); } /** * Returns true if document's site is allowed to be embedded. * @return bool */ protected function isEmbeddingAllowed() { return UrlPreview::isHostTrusted($this->uri); } }