Current Path : /var/www/www-root/data/www/monolith-realty.ru/bitrix/modules/search/tools/ |
Current File : /var/www/www-root/data/www/monolith-realty.ru/bitrix/modules/search/tools/language.php |
<?php
/**
 * Keyboard-layout aware language helper used to guess which language a phrase
 * was typed in and which layout it should be converted to.
 *
 * The class converts text into "scancodes" (character positions inside the
 * strings returned by GetKeyboardLayout()), compares bigram statistics of the
 * text with a language model and checks the text against a list of trigrams.
 * Language specific subclasses are looked up by the name
 * "CSearchLanguage" . $lang_id (see GetLanguage()).
 */
class CSearchLanguage
{
    /** @var array Alphabet characters as keys; filled in GetLanguage() from stemming_init()["abc"]. */
    public $_abc = array();
    /** @var string Language identifier passed to the constructor. */
    public $_lang_id;
    /** @var array|null Memoized result of GetBigrammScancodeFreq(). */
    public $_lang_bigramm_cache;
    /** @var array Trigrams in binary scancode form as keys (value is always true). */
    public $_trigrams = array();
    /** @var bool|null True when the object has a callable "getbigrammletterfreq" method; set in GetLanguage(). */
    public $_has_bigramm_info = null;
    /** @var mixed Not used by the code of this class. */
    public $_bigrams = null;

    /**
     * @param string $lang_id Language identifier.
     */
    public function __construct($lang_id)
    {
        $this->_lang_id = $lang_id;
    }

    /**
     * Loads (once per request) and returns the language object for $sLang.
     *
     * Lookup order: a customized language.php under BX_PERSONAL_ROOT, then the
     * language.php shipped with the module, then the base CSearchLanguage class.
     *
     * @param string $sLang Language identifier.
     * @return CSearchLanguage
     */
    public static function GetLanguage($sLang)
    {
        static $arLanguages = array();

        if (!isset($arLanguages[$sLang])) {
            $obLanguage = null;
            // Stays null when the language class was already declared before this call:
            // in that case no directory is known and there is nothing to load trigrams from.
            $strDirName = null;
            $class_name = mb_strtolower("CSearchLanguage".$sLang);

            if (!class_exists($class_name)) {
                //First try to load customized class
                $strDirName = $_SERVER["DOCUMENT_ROOT"].BX_PERSONAL_ROOT."/php_interface/".$sLang."/search";
                $strFileName = $strDirName."/language.php";
                if (file_exists($strFileName)) {
                    // The customized file may return a ready-to-use object.
                    $obLanguage = @include($strFileName);
                }

                if (!is_object($obLanguage) && !class_exists($class_name)) {
                    //Then module class
                    $strDirName = $_SERVER["DOCUMENT_ROOT"]."/bitrix/modules/search/tools/".$sLang;
                    $strFileName = $strDirName."/language.php";
                    if (file_exists($strFileName)) {
                        if (\Bitrix\Main\Localization\Translation::allowConvertEncoding()) {
                            \Bitrix\Main\Localization\StreamConverter::include($strFileName, $sLang);
                        } else {
                            @include($strFileName);
                        }
                    }

                    //Fall back to the base implementation
                    if (!class_exists($class_name)) {
                        $class_name = "CSearchLanguage";
                    }
                }
            }

            if (!is_object($obLanguage)) {
                $obLanguage = new $class_name($sLang);
            }

            if ($strDirName !== null) {
                $obLanguage->LoadTrigrams($strDirName);
            }

            $arStemInfo = stemming_init($sLang);
            if (is_array($arStemInfo)) {
                $obLanguage->_abc = array_flip($obLanguage->StrToArray($arStemInfo["abc"]));
            }

            $obLanguage->_has_bigramm_info = is_callable(array($obLanguage, "getbigrammletterfreq"));

            $arLanguages[$sLang] = $obLanguage;
        }

        return $arLanguages[$sLang];
    }

    /**
     * Reads the "trigram" file from $dir_name (combinations not allowed in the
     * words) into $this->_trigrams. The parsed list is kept in CPHPCache.
     *
     * @param string $dir_name Directory which may contain the "trigram" file.
     * @return void
     */
    public function LoadTrigrams($dir_name)
    {
        if (!empty($this->_trigrams)) {
            return;
        }

        $file_name = $dir_name."/trigram";
        if (!file_exists($file_name) || !is_file($file_name)) {
            return;
        }

        // File modification time is a part of the cache key, so an updated file is re-read.
        $cache_id = filemtime($file_name).",v1,".$file_name;
        $obCache = new CPHPCache;
        if (!$obCache->StartDataCache(360000, $cache_id, "search")) {
            $this->_trigrams = $obCache->GetVars();
            return;
        }

        $text = file_get_contents($file_name);
        $keyboard = $this->GetKeyboardLayout();
        if (defined("BX_UTF") && isset($keyboard["trigram_charset"])) {
            $text = $GLOBALS["APPLICATION"]->ConvertCharset($text, $keyboard["trigram_charset"], "utf8");
        }

        foreach (explode("\n", $text) as $trigramm) {
            if (mb_strlen($trigramm) != 3) {
                continue;
            }
            // Only trigrams whose three characters are all present on the keyboard are kept.
            $strScanCodesTmp = $this->ConvertToScancode($trigramm, false, true);
            if (mb_strlen($strScanCodesTmp) == 3) {
                $this->_trigrams[$strScanCodesTmp] = true;
            }
        }

        $obCache->EndDataCache($this->_trigrams);
    }

    /**
     * @return bool True when at least one trigram is loaded.
     */
    public function HasTrigrams()
    {
        return !empty($this->_trigrams);
    }

    /**
     * Checks phrase against trigrams.
     *
     * @param array $arScanCodes Scancodes as returned by ConvertToScancode(); false marks a word break.
     * @return int Number of three-code windows (inside a word) found in $this->_trigrams.
     */
    public function CheckTrigrams($arScanCodes)
    {
        $result = 0;
        $check = "";
        $len = 0;

        foreach ($arScanCodes as $code) {
            if ($code === false) {
                //new word starts here
                $check = "";
                $len = 0;
                continue;
            }

            //running window of 3 bytes
            if ($len < 3) {
                $check .= chr($code + 1);
                $len++;
            } else {
                $check = substr($check, 1).chr($code + 1);
            }

            if ($len >= 3 && isset($this->_trigrams[$check])) {
                $result++;
            }
        }

        return $result;
    }

    /**
     * Returns positions of the letters on the keyboard.
     * This one is default English layout.
     *
     * NOTE(review): ConvertToScancode() gives a "lo" character and a "hi"
     * character the same code only when they share a position in these strings,
     * so the rows presumably have to be space-padded to align - verify the
     * spacing of the literals below against the upstream file.
     *
     * @return array Keys "lo" (unshifted characters) and "hi" (shifted characters).
     */
    public function GetKeyboardLayout()
    {
        return array(
            "lo" => "` - ".
                "qwertyuiop[]".
                "asdfghjkl;'".
                "zxcvbnm,. ",
            "hi" => "~ ".
                "QWERTYUIOP{}".
                "ASDFGHJKL:\"".
                "ZXCVBNM<> "
        );
    }

    /**
     * Converts scancodes back into the characters of the "lo" keyboard row.
     *
     * @param array $arScancode List of scancodes.
     * @return string
     */
    public function ConvertFromScancode($arScancode)
    {
        $keyboard = $this->GetKeyboardLayout();
        $result = "";
        foreach ($arScancode as $code) {
            $result .= mb_substr($keyboard["lo"], $code, 1);
        }

        return $result;
    }

    /**
     * Splits a string into an array of characters: per character when BX_UTF is
     * defined, per byte otherwise.
     *
     * @param string $str
     * @return array
     */
    public static function StrToArray($str)
    {
        if (!defined("BX_UTF")) {
            return str_split($str);
        }

        $result = array();
        $len = mb_strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $result[] = mb_substr($str, $i, 1);
        }

        return $result;
    }

    /**
     * Converts text between keyboard layouts of two languages.
     *
     * @param string $text Text to convert.
     * @param string $from Language identifier of the layout the text was typed in.
     * @param string $to Language identifier of the target layout.
     * @return string Converted text, or $text unchanged when a layout is not available.
     */
    public static function ConvertKeyboardLayout($text, $from, $to)
    {
        // Holds both per-language character lists and per-pair ("from|to") translation maps.
        static $keyboards = array();

        $combo = $from."|".$to;
        if (!isset($keyboards[$combo])) {
            //Fill local cache
            foreach (array($from, $to) as $lang_id) {
                if (array_key_exists($lang_id, $keyboards)) {
                    continue;
                }

                $ob = CSearchLanguage::GetLanguage($lang_id);
                $keyboard = $ob->GetKeyboardLayout();
                if (is_array($keyboard)) {
                    $keyboards[$lang_id] = array_merge(
                        $ob->StrToArray($keyboard["lo"]),
                        $ob->StrToArray($keyboard["hi"])
                    );
                } else {
                    $keyboards[$lang_id] = null;
                }
            }

            //when both layouts defined
            if (isset($keyboards[$from]) && isset($keyboards[$to])) {
                $keyboards[$combo] = array();
                foreach ($keyboards[$from] as $i => $ch) {
                    if ($ch != false) {
                        $keyboards[$combo][$ch] = $keyboards[$to][$i];
                    }
                }
            }
        }

        if (!isset($keyboards[$combo])) {
            return $text;
        }

        if (!defined("BX_UTF")) {
            return strtr($text, $keyboards[$combo]);
        }

        $chars = static::StrToArray($text);
        foreach ($chars as $pos => $char) {
            if (isset($keyboards[$combo][$char])) {
                $chars[$pos] = $keyboards[$combo][$char];
            }
        }

        return implode('', $chars);
    }

    /**
     * Converts text into character positions on the keyboard.
     *
     * @param string $text Text to convert.
     * @param bool $strict When true, characters missing from $this->_abc are treated as undefined.
     * @param bool $binary When true, a string is returned where every defined character
     *                     becomes chr(position + 1) and undefined characters are dropped.
     * @return array|string In non-binary mode an array with one entry per character:
     *                      the position, or false for a space or an undefined character.
     */
    public function ConvertToScancode($text, $strict = false, $binary = false)
    {
        // character => position maps, one per language id
        static $cache = array();

        if (!isset($cache[$this->_lang_id])) {
            $map = array();
            $keyboard = $this->GetKeyboardLayout();
            foreach (array("lo", "hi") as $row) {
                foreach ($this->StrToArray($keyboard[$row]) as $pos => $ch) {
                    $map[$ch] = $pos;
                }
            }
            $cache[$this->_lang_id] = $map;
        }
        $scancodes = $cache[$this->_lang_id];

        if ($binary) {
            $result = "";
            foreach ($this->StrToArray($text) as $ch) {
                $allowed = !$strict || isset($this->_abc[$ch]);
                if ($ch !== " " && $allowed && isset($scancodes[$ch])) {
                    $result .= chr($scancodes[$ch] + 1);
                }
            }

            return $result;
        }

        $result = array();
        foreach ($this->StrToArray($text) as $ch) {
            $allowed = !$strict || isset($this->_abc[$ch]);
            if ($ch !== " " && $allowed && isset($scancodes[$ch])) {
                $result[] = $scancodes[$ch];
            } else {
                $result[] = false;
            }
        }

        return $result;
    }

    /**
     * Hook for subclasses to make their own guess before the generic detection.
     *
     * The base implementation returns false, which indicates that there is no own guess.
     * In subclasses you should return array("from" => lang, "to" => lang) to translate,
     * or return true when no translation is needed,
     * or parent::GuessLanguage for further processing.
     *
     * @param string $text
     * @param mixed $lang
     * @return array|bool
     */
    public function PreGuessLanguage($text, $lang = false)
    {
        return false;
    }

    /**
     * Guesses the layout the text was typed in and the layout it should be converted to.
     *
     * @param string $text Text to analyze.
     * @param array|bool $lang List of language identifiers to choose from;
     *                         any non-array value means "all languages from CLanguage::GetList()".
     * @return array|bool array("from" => lang, "to" => lang), or false when no conversion is suggested.
     */
    public static function GuessLanguage($text, $lang = false)
    {
        if ($text == '') {
            return false;
        }

        static $cache = array();
        if (empty($cache)) {
            $cache[] = "en";//English is always in mind and on the first place
            $rsLanguages = CLanguage::GetList();
            while ($arLanguage = $rsLanguages->Fetch()) {
                if ($arLanguage["LID"] != "en") {
                    $cache[] = $arLanguage["LID"];
                }
            }
        }

        $arLanguages = is_array($lang) ? $lang : $cache;
        if (count($arLanguages) < 2) {
            return false;
        }

        //Give customized languages a chance to guess
        foreach ($arLanguages as $lang_id) {
            $ob = CSearchLanguage::GetLanguage($lang_id);
            $res = $ob->PreGuessLanguage($text, $lang_id);
            if (is_array($res)) {
                return $res;
            }
            if ($res === true) {
                return false;
            }
        }

        //First try to detect language which
        //was used to type the phrase
        $max_len = 0;
        $languages_from = array();
        foreach ($arLanguages as $lang_id) {
            $ob = CSearchLanguage::GetLanguage($lang_id);
            $arScanCodesTmp1 = $ob->ConvertToScancode($text, true);
            $_cnt = count(array_filter($arScanCodesTmp1));
            if ($_cnt > $max_len) {
                $max_len = $_cnt;
            }
            $languages_from[$lang_id] = $arScanCodesTmp1;
        }

        if (empty($languages_from)) {
            return false;
        }

        if ($max_len < 2) {
            return false;
        }

        // Keep only the languages which recognized the maximum number of characters.
        $languages_from = array_filter(
            $languages_from,
            function ($a) use ($max_len) {
                return count(array_filter($a)) >= $max_len;
            }
        );

        uasort(
            $languages_from,
            function ($a, $b) {
                return count(array_filter($b)) - count(array_filter($a));
            }
        );

        //If more than one language is detected as input
        //try to get one with best trigram info
        $arDetectionFrom = array();
        $i = 0;
        foreach ($languages_from as $lang_id => $arScanCodes) {
            $ob = CSearchLanguage::GetLanguage($lang_id);
            //Calculate how far sequence of scan codes
            //is from language model
            $deviation = $ob->GetDeviation($arScanCodes);

            $arDetectionFrom[$lang_id] = array(
                $ob->HasTrigrams(),
                $ob->CheckTrigrams($arScanCodes),
                $deviation[1],
                intval($deviation[0] * 100),
                $i,
            );
            $i++;
        }
        uasort($arDetectionFrom, array("CSearchLanguage", "cmp"));

        //Now try the best to detect the language
        $arDetection = array();
        $i = 0;
        foreach ($arDetectionFrom as $lang_from => $arTemp) {
            foreach ($arLanguages as $lang_id) {
                $lang_from_to = $lang_from."=>".$lang_id;

                $ob = CSearchLanguage::GetLanguage($lang_id);
                $alt_text = CSearchLanguage::ConvertKeyboardLayout($text, $lang_from, $lang_id);
                $arScanCodes = $ob->ConvertToScancode($alt_text, true);
                //Calculate how far sequence of scan codes
                //is from language model
                $deviation = $ob->GetDeviation($arScanCodes);

                // Entries are compared element by element (see cmp()), smaller is better.
                $arDetection[$lang_from_to] = array(
                    $ob->HasBigrammInfo() ? 0 : 1,
                    $ob->CheckTrigrams($arScanCodes),
                    -count(array_filter($arScanCodes)),
                    $deviation[1],
                    $deviation[0],
                    $i,
                    $lang_from_to,
                );
                $i++;
            }
        }
        uasort($arDetection, array("CSearchLanguage", "cmp"));

        $language_from_to = key($arDetection);
        list($language_from, $language_to) = explode("=>", $language_from_to);

        $alt_text = CSearchLanguage::ConvertKeyboardLayout($text, $language_from, $language_to);
        if ($alt_text === $text) {
            return false;
        }

        return array("from" => $language_from, "to" => $language_to);
    }

    /**
     * Compares two results of text analysis element by element.
     *
     * @param array $a
     * @param array $b
     * @return int -1, 1, or 0 when all count($a) elements are equal.
     */
    public static function cmp($a, $b)
    {
        $c = count($a);
        for ($i = 0; $i < $c; $i++) {
            if ($a[$i] < $b[$i]) {
                return -1;
            } elseif ($a[$i] > $b[$i]) {
                return 1;
            }
        }

        return 0;
    }

    /**
     * Returns distance of the text (sequence of scan codes) from language model.
     *
     * @param array $arScanCodes Scancodes as returned by ConvertToScancode().
     * @return array array(deviation, number of text bigrams which are absent in the language model).
     */
    public function GetDeviation($arScanCodes)
    {
        //This is language model
        $lang_bigrams = $this->GetBigrammScancodeFreq();
        $lang_count = isset($lang_bigrams["count"]) ? $lang_bigrams["count"] : 0;
        unset($lang_bigrams["count"]);

        //This is text model
        $text_bigrams = $this->ConvertToBigramms($arScanCodes);
        $count = isset($text_bigrams["count"]) ? $text_bigrams["count"] : 0;
        unset($text_bigrams["count"]);

        $deviation = 0;
        $zeroes = 0;
        foreach ($text_bigrams as $key => $value) {
            // $count >= 1 here: it is the total of the occurrences being iterated.
            for ($i = 0; $i < $value; $i++) {
                if (!isset($lang_bigrams[$key])) {
                    $zeroes++;
                    $deviation += 1 / $count;
                } else {
                    // An empty language model (total weight 0) must not cause division by zero.
                    $lang_freq = $lang_count ? $lang_bigrams[$key] / $lang_count : 0;
                    $deviation += abs(1 / $count - $lang_freq);
                }
            }
        }

        return array($deviation, $zeroes);
    }

    /**
     * Returns bigramms of the text (array of scancodes).
     * For example "FAT RAT" will be "FA", "AT", "RA", "AT".
     * This is model of the text.
     *
     * @param array $arScancodes Scancodes; false entries break the pairs.
     * @return array "code1 code2" => number of occurrences, plus "count" => total number of pairs
     *               ("count" is always present, 0 when there are no pairs).
     */
    public function ConvertToBigramms($arScancodes)
    {
        $result = array("count" => 0);

        $len = count($arScancodes) - 1;
        for ($i = 0; $i < $len; $i++) {
            $code1 = $arScancodes[$i];
            $code2 = $arScancodes[$i + 1];
            if ($code1 === false || $code2 === false) {
                continue;
            }

            $key = $code1." ".$code2;
            $result["count"]++;
            $result[$key] = isset($result[$key]) ? $result[$key] + 1 : 1;
        }

        return $result;
    }

    /**
     * @return bool|null Value of $this->_has_bigramm_info (null until GetLanguage() sets it).
     */
    public function HasBigrammInfo()
    {
        return $this->_has_bigramm_info;
    }

    /**
     * Returns model of the language: weights of letter pairs keyed by
     * "position1 position2" plus the total weight under the "count" key.
     *
     * @return array array("count" => 1) when the object has no bigram information.
     */
    public function GetBigrammScancodeFreq()
    {
        if (!$this->HasBigrammInfo()) {
            return array("count" => 1);
        }

        if (!isset($this->_lang_bigramm_cache)) {
            $bigramms = $this->GetBigrammLetterFreq();
            $keyboard = $this->GetKeyboardLayout();
            $keyboard_lo = $keyboard["lo"];
            $keyboard_hi = $keyboard["hi"];

            // Keyboard position of every letter, looked up once ("lo" row first, then "hi").
            $positions = array();
            foreach ($bigramms as $letter => $row) {
                $pos = mb_strpos($keyboard_lo, $letter);
                if ($pos === false) {
                    $pos = mb_strpos($keyboard_hi, $letter);
                }
                $positions[$letter] = $pos;
            }

            $result = array("count" => 0);
            foreach ($bigramms as $letter1 => $row) {
                $p1 = $positions[$letter1];
                // $row is indexed by the ordinal number of the second letter.
                $i = 0;
                foreach ($positions as $p2) {
                    $weight = $row[$i];
                    $result["count"] += $weight;
                    $result[$p1." ".$p2] = $weight;
                    $i++;
                }
            }

            $this->_lang_bigramm_cache = $result;
        }

        return $this->_lang_bigramm_cache;
    }
}
?>