From 7e78cb35b8a0870eaf3c29575c82f7cac1c493d0 Mon Sep 17 00:00:00 2001
From: Lars Moelleken
+ * @param bool $merge Keep the previous (default) array-to-separator array.
+ */ + public static function add_array_to_separator(array $array, bool $merge = true) + { + if ($merge === true) { + self::$arrayToSeparator = \array_unique( + \array_merge( + self::$arrayToSeparator, + $array + ) + ); + } else { + self::$arrayToSeparator = $array; + } + } + + /** + * Add new characters to the list. `$map` should be a hash. + * + * @param array $map * @param string|null $language - */ - public static function add_chars ($map, string $language = null) + */ + public static function add_chars(array $map, string $language = null) { - $language_key = $language ?? uniqid('urlify', true); + $language_key = $language ?? \uniqid('urlify', true); if (isset(self::$maps[$language_key])) { - self::$maps[$language_key] = array_merge($map, self::$maps[$language_key]); + self::$maps[$language_key] = \array_merge($map, self::$maps[$language_key]); } else { self::$maps[$language_key] = $map; } - } + } - /** - * Append words to the remove list. Accepts either single words - * or an array of words. - * @param mixed $words - */ - public static function remove_words ($words) - { - $words = is_array ($words) ? $words : array ($words); - self::$remove_list = array_unique (array_merge (self::$remove_list, $words)); - } - - /** - * Transliterates characters to their ASCII equivalents. + /** + * Transliterates characters to their ASCII equivalents. * $language specifies a priority for a specific language. * The latter is useful if languages have different rules for the same character. - * @param string $text - * @param string $language + * + * @param string $stringThe input string.
+ * @param string $language Your primary language.
+ * @param string $unknown Character to use if a character is unknown (the default is an empty string).
+ * * @return string - */ - public static function downcode ($text, $language = "") - { + */ + public static function downcode( + string $string, + string $language = 'en', + string $unknown = '' + ): string { + + $string = self::expandString($string, $language); + foreach (self::$maps as $mapsInner) { foreach ($mapsInner as $orig => $replace) { - $text = str_replace($orig, $replace, $text); + $string = \str_replace($orig, $replace, $string); } } $langSpecific = \voku\helper\ASCII::charsArrayWithOneLanguage($language, true); if (!empty($langSpecific)) { - $text = str_replace( + $string = \str_replace( $langSpecific['orig'], $langSpecific['replace'], - $text + $string ); } + foreach (\voku\helper\ASCII::charsArrayWithMultiLanguageValues(true) as $replace => $orig) { - $text = str_replace($orig, $replace, $text); + $string = \str_replace($orig, $replace, $string); } - return $text; - } + return \voku\helper\ASCII::to_transliterate($string, $unknown, false); + } - /** - * Filters a string, e.g., "Petty theft" to "petty-theft" - * @param string $text The text to return filtered - * @param int $length The length (after filtering) of the string to be returned - * @param string $language The transliteration language, passed down to downcode() - * @param bool $file_name Whether there should be and additional filter considering this is a filename - * @param bool $use_remove_list Whether you want to remove specific elements previously set in self::$remove_list - * @param bool $lower_case Whether you want the filter to maintain casing or lowercase everything (default) - * @param bool $treat_underscore_as_space Treat underscore as space, so it will replaced with "-" + /** + * Convert a String to URL. + * + * e.g.: "PettyThe text you want to convert.
+ * @param int $maxLength Max. length of the output string, set to "0" (zero) to + * disable it
+ * @param string $language The language you want to convert to.
+ * @param bool $fileName + * Keep the "." from the extension e.g.: "imaäe.jpg" => + * "image.jpg" + *
+ * @param bool $removeWords
+ * Remove some "words" from the string.
+ * Info: Set extra words via remove_words().
+ *
+ * @param bool $strToLower Use strtolower() at the end.
+ * @param bool|string $separator Define a new separator for the words.
+ * * @return string - */ - public static function filter ($text, $length = 60, $language = "", $file_name = false, $use_remove_list = true, $lower_case = true, $treat_underscore_as_space = true) + */ + public static function filter( + string $string, + int $maxLength = 200, + string $language = 'en', + bool $fileName = false, + bool $removeWords = false, + bool $strToLower = true, + $separator = '-' + ): string { + if ($string === '') { + return ''; + } + + // fallback + if ($language === '') { + $language = 'en'; + } + + // separator-fallback + if ($separator === false) { + $separator = '_'; + } + if ($separator === true || $separator === '') { + $separator = '-'; + } + + // escaped separator + $separatorEscaped = \preg_quote($separator, '/'); + + // use defaults, if there are no values + if (self::$arrayToSeparator === []) { + self::reset_array_to_separator(); + } + + // remove apostrophes which are not used as quotes around a string + $stringTmp = \preg_replace("/(\w)'(\w)/u", '${1}${2}', $string); + if ($stringTmp !== null) { + $string = (string) $stringTmp; + } + + // replace with $separator + // + + // remove all other html-tags + $string = \strip_tags( + (string) \preg_replace( + self::$arrayToSeparator, + $separator, + $string + ) + ); + + // use special language replacer + $string = self::downcode($string, $language); + + // replace with $separator, again + $string = (string) \preg_replace( + self::$arrayToSeparator, + $separator, + $string + ); + + // remove all these words from the string before urlifying + $removeWordsSearch = '//'; + if ($removeWords === true) { + $removeList = self::get_remove_list($language); + if ($removeList !== []) { + $removeWordsSearch = '/\b(?:' . \implode('|', $removeList) . ')\b/ui'; + } + } + + // keep the "." from e.g.: a file-extension? + if ($fileName) { + $removePatternAddOn = '.'; + } else { + $removePatternAddOn = ''; + } + + $string = (string) \preg_replace( + [ + '/[^' . $separatorEscaped . $removePatternAddOn . 
'\-a-zA-Z0-9\s]/u', + // 1) remove un-needed chars + '/[\s]+/u', + // 2) convert spaces to $separator + $removeWordsSearch, + // 3) remove some extras words + '/[' . ($separatorEscaped ?: ' ') . ']+/u', + // 4) remove double $separator's + '/[' . ($separatorEscaped ?: ' ') . ']+$/u', + // 5) remove $separator at the end + ], + [ + '', + $separator, + '', + $separator, + '', + ], + $string + ); + + // "substr" only if "$length" is set + if ( + $maxLength + && + $maxLength > 0 + && + \strlen($string) > $maxLength + ) { + $string = (string) \substr(\trim($string, $separator), 0, $maxLength); + } + + // convert to lowercase + if ($strToLower === true) { + $string = \strtolower($string); + } + + // trim "$separator" from beginning and end of the string + return \trim($string, $separator); + } + + /** + * Append words to the remove list. Accepts either single words or an array of words. + * + * @param string|string[] $words + * @param string $language + * @param bool $mergeKeep the previous (default) remove-words array.
+ */ + public static function remove_words($words, string $language = 'en', bool $merge = true) { - $text = self::downcode ($text,$language); - - if ($use_remove_list) { - // remove all these words from the string before urlifying - $text = preg_replace ('/\b(' . implode ('|', self::$remove_list) . ')\b/i', '', $text); - } - - // if downcode doesn't hit, the char will be stripped here - $remove_pattern = ($file_name) ? '/[^_\-.\-a-zA-Z0-9\s]/u' : '/[^\s_\-a-zA-Z0-9]/u'; - $text = preg_replace ($remove_pattern, '', $text); // remove unneeded chars - if ($treat_underscore_as_space) { - $text = str_replace ('_', ' ', $text); // treat underscores as spaces - } - $text = preg_replace ('/^\s+|\s+$/u', '', $text); // trim leading/trailing spaces - $text = preg_replace ('/[-\s]+/u', '-', $text); // convert spaces to hyphens - if ($lower_case) { - $text = strtolower ($text); // convert to lowercase - } - - return trim (substr ($text, 0, $length), '-'); // trim to first $length chars - } - - /** - * Alias of `URLify::downcode()`. - */ - public static function transliterate ($text) + if (\is_array($words) === false) { + $words = [$words]; + } + + /** @noinspection ForeachSourceInspection */ + foreach ($words as $removeWordKey => $removeWord) { + $words[$removeWordKey] = \preg_quote($removeWord, '/'); + } + + if ($merge === true) { + self::$remove_list[$language] = \array_unique( + \array_merge( + self::get_remove_list($language), + $words + ) + ); + } else { + self::$remove_list[$language] = $words; + } + } + + /** + * Reset the internal "self::$arrayToSeparator" to the default values. + */ + public static function reset_array_to_separator() { - return self::downcode ($text); - } + self::$arrayToSeparator = [ + '/"|&|<|>|–|—/i', // ", &, <, >, –, — + '/⁻|-|—|_|"|`|´|\'/', + "#/\r\n|\r|\n|