Your IP : 3.133.124.80


Current Path : /var/www/www-root/data/www.catalog.monolith-realty.ru/bitrix/modules/search/tools/en/
Upload File :
Current File : /var/www/www-root/data/www.catalog.monolith-realty.ru/bitrix/modules/search/tools/en/stemming.php

<?php
// Lookup tables consumed by stemming_en() through "global" declarations.
// They are declared global up front so the assignments below land in the
// global scope even if this file is included from inside a function.
global $STEMMING_EN_STEP2A, $STEMMING_EN_STEP2;
global $STEMMING_EN_STEP3A, $STEMMING_EN_STEP3;
global $STEMMING_EN_STEP4A, $STEMMING_EN_STEP4;
global $STEMMING_EN_EX1, $STEMMING_EN_EX2;

// Step 2: suffix => replacement.
$STEMMING_EN_STEP2A = [
	"TIONAL" => "TION",
	"ENCI" => "ENCE",
	"ANCI" => "ANCE",
	"ABLI" => "ABLE",
	"ENTLI" => "ENT",
	"IZER" => "IZE",
	"IZATION" => "IZE",
	"ATIONAL" => "ATE",
	"ATION" => "ATE",
	"ATOR" => "ATE",
	"ALISM" => "AL",
	"ALITI" => "AL",
	"ALLI" => "AL",
	"FULNESS" => "FUL",
	"OUSLI" => "OUS",
	"OUSNESS" => "OUS",
	"IVENESS" => "IVE",
	"IVITI" => "IVE",
	"BILITI" => "BLE",
	"BLI" => "BLE",
	"FULLI" => "FUL",
	"LESSLI" => "LESS",
];
// Step 2 matcher: every table suffix plus OGI and LI, which stemming_en() handles separately.
$STEMMING_EN_STEP2 = sprintf('/(%s|OGI|LI)$/', implode("|", array_keys($STEMMING_EN_STEP2A)));

// Step 3: suffix => replacement (an empty replacement deletes the suffix).
$STEMMING_EN_STEP3A = [
	"TIONAL" => "TION",
	"ATIONAL" => "ATE",
	"ALIZE" => "AL",
	"ICATE" => "IC",
	"ICITI" => "IC",
	"ICAL" => "IC",
	"FUL" => "",
	"NESS" => "",
];
// Step 3 matcher: every table suffix plus ATIVE, which stemming_en() handles separately.
$STEMMING_EN_STEP3 = sprintf('/(%s|ATIVE)$/', implode("|", array_keys($STEMMING_EN_STEP3A)));

// Step 4: suffixes that are simply deleted.
$STEMMING_EN_STEP4A = [
	"AL", "ANCE", "ENCE", "ER", "IC", "ABLE", "IBLE", "ANT", "EMENT",
	"MENT", "ENT", "ISM", "ATE", "ITI", "OUS", "IVE", "IZE",
];
// Step 4 matcher: every listed suffix plus ION, which stemming_en() handles separately.
$STEMMING_EN_STEP4 = sprintf('/(%s|ION)$/', implode("|", $STEMMING_EN_STEP4A));

// Whole words with a fixed result: stemming_en() returns the mapped value
// for these before running any of the steps.
$STEMMING_EN_EX1 = [
	"SKIS" => "SKI",
	"SKIES" => "SKY",
	"DYING" => "DIE",
	"LYING" => "LIE",
	"TYING" => "TIE",
	"IDLY" => "IDL",
	"GENTLY" => "GENTL",
	"UGLY" => "UGLI",
	"EARLY" => "EARLI",
	"ONLY" => "ONLI",
	"SINGLY" => "SINGL",
	"SKY" => "SKY",
	"NEWS" => "NEWS",
	"HOWE" => "HOWE",
	"ATLAS" => "ATLAS",
	"COSMOS" => "COSMOS",
	"BIAS" => "BIAS",
	"ANDES" => "ANDES",
];

// Words returned as-is when they are reached after step 1a
// (only the keys are looked up; the value 1 is a placeholder).
$STEMMING_EN_EX2 = array_fill_keys(
	["INNING", "OUTING", "CANNING", "HERRING", "EARRING", "PROCEED", "EXCEED", "SUCCEED"],
	1
);
/**
 * Returns the set of characters treated as English letters:
 * the 26 lower-case letters followed by the 26 upper-case letters,
 * both in keyboard (QWERTY) order.
 *
 * @return string
 */
function stemming_letter_en()
{
	$lowerCase = "qwertyuiopasdfghjklzxcvbnm";
	$upperCase = "QWERTYUIOPASDFGHJKLZXCVBNM";

	return $lowerCase.$upperCase;
}

/**
 * Tells whether a word passes the English stop-word filter.
 *
 * The stop list is built once per request from the built-in words plus the
 * comma-separated entries of the optional STEMMING_STOP_EN constant
 * (entries are trimmed; empty ones are skipped; case is left as given).
 *
 * @param string $sWord Word to check; compared to the list exactly as passed.
 * @return bool false if the word is shorter than two characters or is in
 *              the stop list, true otherwise.
 */
function stemming_stop_en($sWord)
{
	if (mb_strlen($sWord) < 2)
	{
		return false;
	}

	static $stopWords = null;
	if ($stopWords === null)
	{
		$stopWords = array_fill_keys(
			[
				"QUOTE", "HTTP", "WWW", "RU", "IMG", "GIF", "A", "THE", "IS",
				"ARE", "OFF", "ON", "AND", "IN", "FOR", "OF", "BY", "WITH",
				"BE", "WAS", "IT",
			],
			0
		);

		if (defined("STEMMING_STOP_EN"))
		{
			foreach (explode(",", STEMMING_STOP_EN) as $extraWord)
			{
				$extraWord = trim($extraWord);
				if ($extraWord !== '')
				{
					$stopWords[$extraWord] = 0;
				}
			}
		}
	}

	$isStopWord = array_key_exists($sWord, $stopWords);

	return !$isStopWord;
}

/**
 * Upper-cases text for the English stemmer by delegating to ToUpper(),
 * which is defined outside this file.
 *
 * @param string $sText Text to convert.
 * @return mixed Whatever ToUpper() returns for $sText.
 */
function stemming_upper_en($sText)
{
	$upperCased = ToUpper($sText);

	return $upperCased;
}

/**
 * Reduces an English word to its stem.
 *
 * Works through the exception lists, the regions R1/R2 and steps 1a-5 as
 * described in the inline comments. NOTE(review): this looks like the
 * Snowball "English" (Porter2) stemmer - confirm against the reference.
 * All suffix tables and patterns are written for upper-case letters; a
 * lower-case "y" is used internally to mark a Y that acts as a consonant
 * and is turned back into "Y" before returning.
 *
 * Reads the STEMMING_EN_* global tables defined at the top of this file.
 *
 * @param string $word Word to stem.
 * @return string The stem; words of two characters or fewer are returned unchanged.
 */
function stemming_en($word)
{
	global $STEMMING_EN_STEP2A, $STEMMING_EN_STEP2;
	global $STEMMING_EN_STEP3A, $STEMMING_EN_STEP3;
	global $STEMMING_EN_STEP4A, $STEMMING_EN_STEP4;
	global $STEMMING_EN_EX1, $STEMMING_EN_EX2;

	// Words of one or two letters are left alone.
	$length = mb_strlen($word);
	if ($length <= 2)
	{
		return $word;
	}

	// Exceptional forms have a ready-made answer.
	if (array_key_exists($word, $STEMMING_EN_EX1))
	{
		return $STEMMING_EN_EX1[$word];
	}

	// Mark an initial Y, and a Y that follows a vowel, as a consonant by lower-casing it.
	$vowelSet = "AEIOUY";
	$word = preg_replace("/^Y/", "y", $word);
	$word = preg_replace("/([{$vowelSet}])(Y)/", "\\1y", $word);

	// Starting at $pos: skip non-vowels, then skip vowels, then step over the
	// non-vowel that ended the vowel run (if the word has not ended).
	// The returned offset is where the next region begins.
	$regionStart = static function ($pos) use ($word, $length, $vowelSet)
	{
		for (; $pos < $length; $pos++)
		{
			if (mb_strpos($vowelSet, mb_substr($word, $pos, 1)) !== false)
			{
				break;
			}
		}
		for (; $pos < $length; $pos++)
		{
			if (mb_strpos($vowelSet, mb_substr($word, $pos, 1)) === false)
			{
				break;
			}
		}

		return ($pos < $length) ? $pos + 1 : $pos;
	};

	// true-ish when the part of $text starting at $offset ends with $suffix.
	$endsInRegion = static function ($suffix, $text, $offset)
	{
		return preg_match("/".$suffix."$/", mb_substr($text, $offset));
	};

	// $text without its last $count characters.
	$dropTail = static function ($text, $count)
	{
		return mb_substr($text, 0, mb_strlen($text) - $count);
	};

	// R1 is the region after the first non-vowel following a vowel (or the end of the word).
	// Words starting with COMMUN / GENER get a fixed R1 instead.
	$r1 = $regionStart(0);
	if (preg_match("/^COMMUN/", $word))
	{
		$r1 = 6;
	}
	if (preg_match("/^GENER/", $word))
	{
		$r1 = 5;
	}
	// R2 is found the same way, scanning from R1.
	$r2 = $regionStart($r1);

	// Step 1a: longest of SSES, IED, IES, US, SS, S.
	$hit = array();
	if (preg_match("/(SSES|IED|IES|US|SS|S)$/", $word, $hit))
	{
		$suffix = $hit[0];
		if ($suffix === "SSES")
		{
			// sses -> ss
			$word = $dropTail($word, 4)."SS";
		}
		elseif ($suffix === "IED" || $suffix === "IES")
		{
			// -> i if preceded by more than one letter, otherwise -> ie (ties -> tie, cries -> cri)
			$replacement = (mb_strlen($word) > 4) ? "I" : "IE";
			$word = $dropTail($word, 3).$replacement;
		}
		elseif ($suffix === "S")
		{
			// delete s if the preceding part has a vowel that is not immediately before the s
			// (gas and this keep the s, gaps and kiwis lose it)
			if (preg_match("/([{$vowelSet}].*.)(S)$/", $word))
			{
				$word = $dropTail($word, 1);
			}
		}
		// US and SS: nothing to do
	}

	// Second exception list: these forms are final after step 1a.
	if (array_key_exists($word, $STEMMING_EN_EX2))
	{
		return $word;
	}

	// Step 1b: longest of EEDLY, INGLY, EDLY, EED, ING, ED.
	if (preg_match("/(EEDLY|INGLY|EDLY|EED|ING|ED)$/", $word, $hit))
	{
		$suffix = $hit[0];
		if ($suffix === "EEDLY" || $suffix === "EED")
		{
			// replace by ee if in R1
			if ($endsInRegion($suffix, $word, $r1))
			{
				$word = $dropTail($word, mb_strlen($suffix))."EE";
			}
		}
		else
		{
			// delete the suffix if the preceding part contains a vowel, and then...
			$trimmed = preg_replace("/([{$vowelSet}].*)(ED|EDLY|ING|INGLY)$/", "\\1", $word);
			if ($trimmed !== $word)
			{
				if (preg_match("/(AT|BL|IZ)$/", $trimmed))
				{
					// ...ends at, bl or iz: add e (luxuriat -> luxuriate)
					$word = $trimmed."E";
				}
				elseif (preg_match("/(BB|DD|FF|GG|MM|NN|PP|RR|TT)$/", $trimmed))
				{
					// ...ends with a double: remove the last letter (hopp -> hop)
					$word = $dropTail($trimmed, 1);
				}
				elseif (
					preg_match("/^[^{$vowelSet}]+[{$vowelSet}][^WXy{$vowelSet}]$/", $trimmed)
					|| preg_match("/^[{$vowelSet}][^{$vowelSet}]$/", $trimmed)
				)
				{
					// ...the word is short: add e (hop -> hope).
					// Short here means: non-vowels, one vowel, one non-vowel other than W, X or
					// consonant y; or exactly a vowel followed by a non-vowel.
					$word = $trimmed."E";
				}
				else
				{
					$word = $trimmed;
				}
			}
		}
	}

	// Step 1c: final y/Y -> i when preceded by a non-vowel that is not the first letter
	// (cry -> cri, by -> by, say -> say).
	$word = preg_replace("/^(.+[^{$vowelSet}])([yY])$/", "\\1I", $word);

	// Step 2: longest listed suffix; act only if it lies in R1.
	if (preg_match($STEMMING_EN_STEP2, $word, $hit) && $endsInRegion($hit[0], $word, $r1))
	{
		$suffix = $hit[0];
		if ($suffix === "OGI")
		{
			// ogi -> og if preceded by l
			if (preg_match("/LOGI$/", $word))
			{
				$word = $dropTail($word, 3)."OG";
			}
		}
		elseif ($suffix === "LI")
		{
			// delete li if preceded by one of c d e g h k m n r t
			if (preg_match("/[CDEGHKMNRT]LI$/", $word))
			{
				$word = $dropTail($word, 2);
			}
		}
		else
		{
			$word = $dropTail($word, mb_strlen($suffix)).$STEMMING_EN_STEP2A[$suffix];
		}
	}

	// Step 3: longest listed suffix; act only if it lies in R1.
	if (preg_match($STEMMING_EN_STEP3, $word, $hit) && $endsInRegion($hit[0], $word, $r1))
	{
		$suffix = $hit[0];
		if ($suffix === "ATIVE")
		{
			// delete ative only if it also lies in R2
			if (preg_match("/ATIVE$/", mb_substr($word, $r2)))
			{
				$word = $dropTail($word, 5);
			}
		}
		else
		{
			$word = $dropTail($word, mb_strlen($suffix)).$STEMMING_EN_STEP3A[$suffix];
		}
	}

	// Step 4: longest listed suffix; delete it if it lies in R2
	// (ion additionally has to be preceded by s or t).
	if (preg_match($STEMMING_EN_STEP4, $word, $hit) && $endsInRegion($hit[0], $word, $r2))
	{
		$suffix = $hit[0];
		if ($suffix !== "ION" || preg_match("/[ST]ION$/", $word))
		{
			$word = $dropTail($word, mb_strlen($suffix));
		}
	}

	// Step 5: drop a final e if it is in R2, or in R1 and not preceded by a short syllable;
	// otherwise drop a final l if it is in R2 and preceded by another l.
	$tailR1 = mb_substr($word, $r1);
	$tailR2 = mb_substr($word, $r2);
	$afterShortSyllable =
		preg_match("/[^{$vowelSet}][{$vowelSet}][^WXy{$vowelSet}].$/", $word)
		|| preg_match("/^[{$vowelSet}][^{$vowelSet}].$/", $word);

	if (preg_match("/E$/", $tailR2) || (preg_match("/E$/", $tailR1) && !$afterShortSyllable))
	{
		$word = $dropTail($word, 1);
	}
	elseif (preg_match("/L$/", $tailR2) && preg_match("/LL$/", $word))
	{
		$word = $dropTail($word, 1);
	}

	// Undo the consonant marking.
	return str_replace("y", "Y", $word);
}