* @copyright 2007 Michel Corne
* @license http://www.opensource.org/licenses/bsd-license.php The BSD License
* @version Release: @package_version@
* @link http://pear.php.net/package/I18N_UnicodeNormalizer
* @link http://www.unicode.org/Public/UNIDATA/
* @see I18N_UnicodeNormalizer::$compiled for the list of compiled files.
*/
class I18N_UnicodeNormalizer_Compiler
{
/**
 * Code point output format
 *
 * 'utf8': code points are converted from Unicode to UTF-8 characters
 * 'ucn': code points are converted from Unicode to their UCN format, e.g. \u000A
 *
 * The constructor stores 'ucn' only when that exact value is requested,
 * and 'utf8' in every other case.
 *
 * @var string
 * @access private
 */
private $codeFormat;
/**
 * The name list of compiled files
 *
 * Obtained from I18N_UnicodeNormalizer::getFileNames() by the constructor;
 * 'utf8' is replaced by 'ucn' in the names when the UCN format is selected.
 *
 * @var array
 * @access private
 */
private $compiled;
/**
 * The Unicode.org data files
 *
 * Declared as short key => file name pairs. The constructor prepends the
 * "<base directory>/unicodedata/" path to every file name, so that after
 * construction the values are file paths.
 *
 * @var array
 * @access private
 */
private $data = array(// /
'corrections' => 'NormalizationCorrections.txt',
'derived_norm' => 'DerivedNormalizationProps.txt',
'norm_test' => 'NormalizationTest-3.2.0.txt',
'unicode_data' => 'UnicodeData.txt',
);
/**
 * The I18N_UnicodeNormalizer_File class instance
 *
 * Used to write the compiled files (see the put() calls).
 *
 * @var object
 * @access private
 */
private $file;
/**
 * The force-compilation flag
 *
 * Forces all files to be (re)compiled if true. If false, a file is compiled
 * only when its compiled version does not exist yet.
 *
 * @var boolean
 * @access private
 */
private $forceCompile;
/**
 * Range limit up to which a Unicode code point range is expanded
 *
 * A range is expanded into individual code points when the difference
 * between its last and first code points is lower than or equal to this
 * limit. Larger ranges are collected separately by parseFile().
 * The constructor sets it to 2 for the limited data set, or 12000 otherwise.
 *
 * @var integer
 * @access private
 */
private $rangeLimit;
/**
 * The I18N_UnicodeNormalizer_String class instance
 *
 * Used to convert code points (see the dec2ucn() and unicode2char() calls).
 *
 * @var object
 * @access private
 */
private $string;
/**
 * The class constructor
 *
 * Stores the Unicode code point format, the force-compilation option and
 * the code point range limit. Resolves the names of the compiled files and
 * the paths of the Unicode.org data files. Creates the file and string helpers.
 *
 * @param string  $dir            the data/compiled files base directory,
 *                                the default is set by the normalizer
 * @param string  $codeFormat     'utf8': for production, or 'ucn': for
 *                                testing purposes, the default is 'utf8'
 * @param boolean $forceCompile   compilation option: files are to be
 *                                (re)compiled if true,
 *                                files are recompiled as needed if false
 * @param boolean $limitedDataSet compiles a small subset of data
 *                                for testing/coverage purposes if true,
 *                                or uses all data otherwise
 * @return void
 * @access public
 */
public function __construct($dir = '', $codeFormat = '', $forceCompile = false, $limitedDataSet = false)
{
    // any format other than 'ucn' falls back to 'utf8'
    if ($codeFormat == 'ucn') {
        $this->codeFormat = 'ucn';
    } else {
        $this->codeFormat = 'utf8';
    }
    $this->forceCompile = $forceCompile;
    // a tiny range limit keeps the compiled data small for tests
    if ($limitedDataSet) {
        $this->rangeLimit = 2;
    } else {
        $this->rangeLimit = 12000;
    }

    // gets the names of the files to compile, renames them for UCN if requested
    $fileNames = I18N_UnicodeNormalizer::getFileNames($dir);
    if ($this->codeFormat == 'ucn') {
        $fileNames = str_replace('utf8', 'ucn', $fileNames);
    }
    $this->compiled = $fileNames;

    // builds the unicodedata base path, falling back to the normalizer data directory
    if (!$dir) {
        $dir = I18N_UnicodeNormalizer::getDataDir();
    }
    $basePath = $dir . '/unicodedata/';
    // prepends the base path to each data file name, keeping the keys
    foreach ($this->data as $key => $name) {
        $this->data[$key] = $basePath . $name;
    }

    $this->file = new I18N_UnicodeNormalizer_File();
    $this->string = new I18N_UnicodeNormalizer_String();
}
/**
 * Compiles all the Unicode text data files into PHP files
 *
 * Runs every compilation step in dependency order: quick checks and
 * exclusions, Hangul decompositions/compositions, corrections, the main
 * Unicode data, the compositions and finally the regression tests.
 *
 * @return void
 * @access public
 */
public function compileAll()
{
    // the quick check data first, then the composition exclusions,
    // all extracted from the derived normalization properties
    $derivedSteps = array(
        'quick_check_nfc' => 'NFC_QC',
        'quick_check_nfd' => 'NFD_QC',
        'quick_check_nfkc' => 'NFKC_QC',
        'quick_check_nfkd' => 'NFKD_QC',
        'exclusions' => 'Full_Composition_Exclusion',
    );
    foreach ($derivedSteps as $compiledName => $keyword) {
        $this->compileDerivedNorm($compiledName, $keyword);
    }

    // generates the Hangul decompositions, then flips them into compositions
    $this->compileHangulDecomp();
    $this->compileCompositions('hangul_compos', 'hangul_decomp');

    // the corrections are loaded by the Unicode main data compilation
    $this->compileCorrections();
    $this->compileUnicodeData();

    // generates the compositions from the canonical decompositions
    $this->compileCompositions('compositions', 'canonical_decomp');

    // compiles the base tests (lines without HANGUL), then the Hangul tests
    $testSteps = array('test_base' => false, 'test_hangul' => true);
    foreach ($testSteps as $fileName => $includeKeyword) {
        $this->compileTest($fileName, 'HANGUL', $includeKeyword);
    }
}
/**
 * Generates the compositions by flipping a compiled decomposition file
 *
 * Nothing is done when the composition file already exists and the
 * compilation is not forced.
 *
 * @param string $composFileName the composition file name
 * @param string $decompFileName the decomposition file name
 * @return void
 * @access private
 */
private function compileCompositions($composFileName, $decompFileName)
{
    $target = $this->compiled[$composFileName];
    if (!$this->forceCompile and file_exists($target)) {
        // the compositions are already compiled
        return;
    }

    // loads the decomposition mappings and the exclusions
    $source = $this->compiled[$decompFileName];
    $decompositions = require($source);
    $exclusions = require($this->compiled['exclusions']);

    // drops the excluded code points, then swaps keys and values so that
    // each decomposition maps back to its composed code point
    $compositions = array_flip(array_diff_key($decompositions, $exclusions));

    if ($composFileName != 'hangul_compos') {
        // adds the Hangul compositions, existing entries take precedence
        $compositions += require($this->compiled['hangul_compos']);
    }

    // writes the compiled compositions
    $this->file->put($target, $compositions, __CLASS__ . '::' . __FUNCTION__, $source);
}
/**
 * Compiles the corrections
 *
 * Parses the normalization corrections data file and keeps, for each code
 * point, the third field of its line (index 2), i.e. the corrected
 * decomposition. Nothing is done when the compiled file already exists and
 * the compilation is not forced.
 *
 * @return void
 * @access private
 */
private function compileCorrections()
{
    if ($this->forceCompile or !file_exists($this->compiled['corrections'])) {
        // the corrections are not or must be compiled
        // parses the corrections
        $corrections = $this->parseFile($this->data['corrections']);
        // extracts the code point decomposition corrections,
        // note: a plain loop is used instead of create_function(),
        // which is no longer available as of PHP 8.0
        foreach ($corrections as $code => $properties) {
            $corrections[$code] = $properties[2];
        }
        // writes the compiled corrections
        $this->file->put($this->compiled['corrections'], $corrections,
            __CLASS__ . '::' . __FUNCTION__, $this->data['corrections']);
    }
}
/**
 * Compiles the decomposition mappings, then their expanded form including Hangul
 *
 * Writes two files: the mappings as received, and the fully expanded
 * mappings completed with the compiled Hangul decompositions.
 *
 * @param array  $decomposition the decomposition mappings, each one being
 *                              a list of converted code points
 * @param string $fileName      the decomposition file name
 * @param string $fileNameX     the expanded decomposition file name
 * @return array the expanded decomposition mappings, without Hangul
 * @access private
 */
private function compileDecomp($decomposition, $fileName, $fileNameX)
{
    $origin = __CLASS__ . '::' . __FUNCTION__;
    $plainFile = $this->compiled[$fileName];

    // writes the compiled decomposition mappings, not expanded yet
    $this->file->put($plainFile, $this->implode($decomposition), $origin, $this->data['unicode_data']);

    // expands each code point decomposition in place
    $codes = array_keys($decomposition);
    foreach ($codes as $code) {
        $this->expandDecomp($code, $decomposition);
    }

    // adds the Hangul decomposition mappings, existing entries take precedence
    $hangulDecomp = require($this->compiled['hangul_decomp']);
    $expanded = $this->implode($decomposition) + $hangulDecomp;

    // writes the compiled expanded decomposition mappings
    $this->file->put($this->compiled[$fileNameX], $expanded, $origin, $plainFile);

    return $decomposition;
}
/**
 * Compiles the derived normalization data to generate the quick checks or the exclusions
 *
 * Extracts the data lines that include a specific keyword. Compiles that data
 * as a set: every extracted entry is mapped to true. Nothing is done when the
 * compiled file already exists and the compilation is not forced.
 *
 * @param string $compiledName the name of the compiled file
 * @param string $keyword      the keyword within the data lines to extract,
 *                             e.g. 'NFC_QC', 'NFD_QC', 'NFKC_QC', 'NFKD_QC',
 *                             'Full_Composition_Exclusion'
 * @return void
 * @access private
 */
private function compileDerivedNorm($compiledName, $keyword)
{
    if ($this->forceCompile or !file_exists($this->compiled[$compiledName])) {
        // the file is not or must be compiled
        // parses/filters the derived normalizations
        $parsed = $this->parseFile($this->data['derived_norm'], $keyword);
        // sets the property value to true if present,
        // note: the quick check values M[aybe] and N[o] are both treated as No,
        // note: a plain loop is used instead of create_function(),
        // which is no longer available as of PHP 8.0
        foreach (array_keys($parsed) as $code) {
            $parsed[$code] = true;
        }
        // writes the compiled file
        $this->file->put($this->compiled[$compiledName], $parsed,
            __CLASS__ . '::' . __FUNCTION__, $this->data['derived_norm']);
    }
}
/**
 * Creates the Hangul decomposition mappings
 *
 * Computes arithmetically the decomposition of each Hangul syllable into
 * its leading consonant (L), vowel (V) and optional trailing consonant (T).
 * The number of syllables is capped by the range limit. Nothing is done
 * when the compiled file already exists and the compilation is not forced.
 *
 * @return void
 * @access private
 * @link http://www.unicode.org/unicode/reports/tr15/#Hangul
 */
private function compileHangulDecomp()
{
    if ($this->forceCompile or !file_exists($this->compiled['hangul_decomp'])) {
        // the hangul decomposition mappings are not or must be compiled
        $sbase = 0xAC00;
        $lbase = 0x1100;
        $vbase = 0x1161;
        $tbase = 0x11a7;
        $lcount = 19;
        $vcount = 21;
        $tcount = 28;
        $ncount = $vcount * $tcount;
        $scount = $lcount * $ncount;
        // limits the code points range
        $scount = min($scount, $this->rangeLimit);
        // starts from an empty list so that a defined value is always written
        $decomposition = array();
        for ($sindex = 0; $sindex < $scount; $sindex++) {
            // computes the L, V, and T syllable code point unicode values,
            // note: "/" returns a float in PHP, the quotients are truncated
            // explicitly so that only integer code points are converted
            $l = $lbase + (int)($sindex / $ncount);
            $v = $vbase + (int)(($sindex % $ncount) / $tcount);
            $t = $tbase + $sindex % $tcount;
            // converts the L, V, T syllable to UTF-8 or UCN, concatenates L+V+T
            $hangulCode = $this->convertCode($l);
            $hangulCode .= $this->convertCode($v);
            // a T value equal to its base means there is no trailing consonant
            $t != $tbase and $hangulCode .= $this->convertCode($t);
            $code = $sindex + $sbase;
            $decomposition[$this->convertCode($code)] = $hangulCode;
        }
        // writes the compiled hangul decomposition mappings
        $this->file->put($this->compiled['hangul_decomp'], $decomposition,
            __CLASS__ . '::' . __FUNCTION__);
    }
}
/**
 * Compiles the Unicode.org regression test file
 *
 * Extracts the test lines that include (or do not include if specified)
 * a specific keyword. Compiles those lines into arrays of five converted
 * strings indexed from 1 to 5, like the data file columns. Nothing is done
 * when the compiled file already exists and the compilation is not forced.
 *
 * @param string  $fileName       the compiled file name
 * @param string  $keyword        the keyword within the test lines to extract,
 *                                e.g. "HANGUL"
 * @param boolean $includeKeyword extracts the lines with the keyword if true,
 *                                or without if false
 * @return void
 * @access private
 */
private function compileTest($fileName, $keyword, $includeKeyword = true)
{
    $target = $this->compiled[$fileName];
    if (!$this->forceCompile and file_exists($target)) {
        // the test is already compiled
        return;
    }

    // loads the corrections, parses the test data indexed by line number
    $corrections = require($this->compiled['corrections']);
    $parsed = $this->parseFile($this->data['norm_test'], $keyword, $includeKeyword, false);

    $normTest = array();
    foreach ($parsed as $line => $test) {
        if (count($test) != 6) {
            // not a proper test, e.g. a section title such as
            // @Part 0 # Specific cases
            continue;
        }
        // removes the empty trailing entry
        unset($test[5]);

        // the first test entry is a single code point or a code list
        $code = $test[0];
        $codePoints = $this->splitCodes($code);
        if (count($codePoints) == 1) {
            // a single code point, checks if it has a decomposition correction
            $codePoint = $codePoints[0];
            if (isset($corrections[$codePoint])) {
                // resets the test entries to the code point
                // followed by 4 times the same decomposition correction
                $test = array($code) + array_fill(1, 4, $corrections[$codePoint]);
            }
        }

        // splits/converts each entry, then glues the code points into one string
        $columns = $this->implode(array_map(array($this, 'splitCodes'), $test));
        // reindexes the test according to the data file column numbers
        $normTest[$line] = array_combine(range(1, 5), $columns);
    }

    // writes the compiled normalization test
    $this->file->put($target, $normTest, __CLASS__ . '::' . __FUNCTION__, $this->data['norm_test']);
}
/**
 * Compiles the combining file and the decomposition files
 *
 * Parses the Unicode main data file, applies the decomposition corrections,
 * then extracts for each code point its canonical combining class (field 3)
 * and its canonical or compatibility decomposition mapping (field 5).
 * Nothing is done when all the compiled files already exist and the
 * compilation is not forced.
 *
 * Terminates the script if a correction refers to a code point that is
 * missing from the Unicode main data.
 *
 * @return void
 * @access private
 */
private function compileUnicodeData()
{
    if ($this->forceCompile or !file_exists($this->compiled['combining']) or
        !file_exists($this->compiled['canonical_decomp']) or
        !file_exists($this->compiled['canonical_decomp_x']) or
        !file_exists($this->compiled['compat_decomp']) or
        !file_exists($this->compiled['compat_decomp_x'])) {
        // the combining and decompositions files are not or must be compiled
        // parses the unicode data, loads the corrections
        $unicodeData = $this->parseFile($this->data['unicode_data']);
        $corrections = require($this->compiled['corrections']);
        foreach ($corrections as $code => $correctDecomp) {
            // checks there is a code point for this correction
            isset($unicodeData[$code]) or
                die('Error: missing code point with a correction in ' . $this->data['unicode_data']);
            $unicodeData[$code][5] = $correctDecomp;
        }
        // not expecting ranges for canonical class combining and decomposition mappings
        unset($unicodeData['code_ranges']);
        // starts from empty lists so that the lists are always defined,
        // even if the (possibly limited) data set has no entry of a given kind
        $combining = array();
        $canonicalDecomp = array();
        $compatDecomp = array();
        foreach ($unicodeData as $code => $properties) {
            // captures the canonical combining class, if different from 0
            $combiningClass = (int)$properties[3];
            if ($combiningClass) {
                $combining[$code] = $combiningClass;
            }
            // extracts the decomposition details
            $mapping = $properties[5];
            if ($mapping != '') {
                // note: the square bracket offset is used instead of the
                // curly brace one, which is no longer available as of PHP 8.0
                if ($mapping[0] != '<') {
                    // a canonical decomposition, extracts the canonical decomposition
                    $canonicalDecomp[$code] = $this->splitCodes($mapping);
                } else {
                    // a compatibility decomposition, e.g. prefixed with a <tag>,
                    // removes the tag, extracts the compatibility decomposition
                    $mapping = substr($mapping, strpos($mapping, '>') + 1);
                    if ($mapping) {
                        $compatDecomp[$code] = $this->splitCodes($mapping);
                    }
                }
            }
        }
        // writes the compiled combining classes
        $this->file->put($this->compiled['combining'], $combining,
            __CLASS__ . '::' . __FUNCTION__, $this->data['unicode_data']);
        // compiles the canonical decomposition
        $canonicalDecomp = $this->compileDecomp($canonicalDecomp, 'canonical_decomp', 'canonical_decomp_x');
        // adds the expanded canonical decompositions to the compatibility decomposition mappings
        $compatDecomp += $canonicalDecomp;
        // compiles the compatibility decompositions mappings
        $this->compileDecomp($compatDecomp, 'compat_decomp', 'compat_decomp_x');
    }
}
/**
 * Converts a Unicode code point to a UTF-8 character or to its UCN format
 *
 * A code point passed as a string is read as a hexadecimal number.
 * Any other value is handed to the string helper as is.
 *
 * @param mixed $code the Unicode code point, as a hexadecimal string,
 *                    e.g. 000A, or 0x000A, or \u000A, or as a number
 * @return string the UTF-8 character or the Unicode code point in its UCN format
 * @access public
 */
public function convertCode($code)
{
    if (is_string($code)) {
        // hexadecimal text to number, e.g. 000A to 10
        $code = hexdec($code);
    }
    if ($this->codeFormat == 'ucn') {
        // formats the code point as a UCN hexadecimal string
        return $this->string->dec2ucn($code);
    }
    // packs the code point into a UTF-8 binary string
    return $this->string->unicode2char($code);
}
/**
 * Expands recursively the decomposition mapping of a code point
 *
 * Each mapped code point that has itself a decomposition is replaced by
 * that (expanded) decomposition. The mappings are updated in place.
 * Terminates the script if a code point is met again while it is still
 * being expanded, i.e. if the mappings loop.
 *
 * @param string $code           the Unicode code point
 * @param array  &$decomposition the code point decomposition mappings
 * @return boolean the code point was expanded if true, or has no
 *                 decomposition if false
 * @access private
 */
private function expandDecomp($code, &$decomposition)
{
    // the code points currently being expanded, kept across the recursive calls
    static $isExpanded;

    if (isset($isExpanded[$code])) {
        // the code point decomposition is being expanded, error: entering infinite loop
        die('Error: recursion loop in ' . __CLASS__ . '::' . __FUNCTION__);
    }
    if (!isset($decomposition[$code])) {
        // the code point has no decomposition
        return false;
    }

    // flags the code point decomposition as being expanded
    $isExpanded[$code] = true;

    $decomposed = array();
    foreach ($decomposition[$code] as $mappedCode) {
        $wasExpanded = $this->expandDecomp($mappedCode, $decomposition);
        if (!$wasExpanded) {
            // the mapped code has no decomposition, keeps the mapped code itself
            $decomposed[] = $mappedCode;
            continue;
        }
        // the mapped code has a decomposition, appends its expanded decomposition
        $decomposed = array_merge($decomposed, $decomposition[$mappedCode]);
    }

    // stores the expanded decomposition, the code point is no longer being expanded
    $decomposition[$code] = $decomposed;
    unset($isExpanded[$code]);

    return true;
}
/**
 * Gets the name list of the compiled files
 *
 * Returns the list built by the constructor, i.e. with the names adapted
 * to the UCN format when that format was requested.
 *
 * @return array the name list of compiled files
 * @access public
 */
public function getFileNames()
{
return $this->compiled;
}
/**
 * Implodes an array's arrays
 *
 * Joins the values of each sub-array into a single string, without any
 * separator. The keys of the outer array are preserved.
 *
 * @param array $array the array of arrays
 * @return array an array of strings
 * @access public
 */
public function implode($array)
{
    // note: a plain loop is used instead of array_map() with create_function(),
    // which is no longer available as of PHP 8.0
    $imploded = array();
    foreach ($array as $key => $values) {
        // calls the native implode() function, not this method
        $imploded[$key] = implode('', $values);
    }
    return $imploded;
}
/**
 * Parses a Unicode text data file
 *
 * Can extract only the lines that have (or do not have if specified) a keyword.
 * Removes the comments, i.e. what follows a "#". Splits each line into its
 * ";" separated properties. Expands the ranges of Unicode code points that
 * do not exceed the range limit. Converts the Unicode code points
 * to UTF-8 characters or their UCN format.
 *
 * With code point indexing, the larger ranges are not expanded but gathered
 * in the "code_ranges" entry, each as an array of two entries: first code =>
 * properties of the "First" line (null if the range is written as XXXX..YYYY),
 * and last code => properties of the line. Terminates the script if a code
 * point is defined twice.
 *
 * @param string  $fileName       the file to parse
 * @param string  $keyword        the keyword within the test lines to extract,
 *                                e.g. "HANGUL", or null to extract all lines
 * @param boolean $includeKeyword extracts the lines with the keyword if true,
 *                                or without if false
 * @param boolean $codeIndexing   entries are to be indexed with the code point
 *                                as a key if true, or with the line number
 *                                (starting from 1) if false
 * @return array the parsed data
 * @access private
 */
private function parseFile($fileName, $keyword = null, $includeKeyword = true, $codeIndexing = true)
{
    // the "First" line details of a range, kept until its "Last" line is read,
    // note: plain local variables, so that nothing leaks from one parsed file
    // to the next one
    $firstCode = null;
    $firstProperties = null;
    $rangeList = array();
    $data = array();
    foreach (file($fileName) as $line => $string) {
        if ($keyword === null or (strpos($string, $keyword) !== false) === $includeKeyword) {
            // the string contains (or not) the keyword
            // removes the comments, extracts the properties, trims the properties
            list($string) = explode('#', $string);
            $properties = explode(';', trim($string));
            $properties = array_map('trim', $properties);
            if (($codeRange = current($properties)) != '') {
                // not an empty/comment line, captures the code range or the single code
                if (!$codeIndexing) {
                    // file line number indexing, captures the properties
                    $data[$line + 1] = $properties;
                } else {
                    // entries to be indexed with the code point value
                    if (($name = next($properties)) !== false) {
                        // there is a second entry, typically a name
                        if (strpos($name, ', First') !== false) {
                            // the first code value of a range,
                            // e.g. <CJK Ideograph Extension A, First>
                            // captures the first code value
                            $firstCode = $codeRange;
                            $firstProperties = $properties;
                            continue;
                        } else if (strpos($name, ', Last') !== false) {
                            // the last code value of a range,
                            // e.g. <CJK Ideograph Extension A, Last>
                            // note: the first and last code entries are expected
                            // to follow each other, see UnicodeData.txt
                            // sets the range with the previously stored
                            // first and last code values
                            $codeRange = $firstCode . '..' . $codeRange;
                        }
                    }
                    // extracts the possible code range, converts the first
                    // and last code values to binary
                    $range = explode('..', $codeRange);
                    $firstCode = hexdec(current($range));
                    $lastHex = next($range);
                    $lastCode = $lastHex ? hexdec($lastHex) : 0;
                    if (!$lastCode) {
                        // a single code point, not a range
                        $lastCode = $firstCode;
                    }
                    if (($lastCode - $firstCode) <= $this->rangeLimit) {
                        // the range is to be expanded
                        // changes the array to string if there is only one property
                        if (count($properties) == 1) {
                            $properties = $codeRange;
                        }
                        for ($code = $firstCode; $code <= $lastCode; $code++) {
                            // scans the range
                            $formatted = $this->convertCode($code);
                            isset($data[$formatted]) and
                                die('Error: not expecting duplicate code point entries in ' . $fileName);
                            // indexes the properties with code
                            $data[$formatted] = $properties;
                        }
                    } else {
                        // a large range
                        if ($this->codeFormat == 'ucn') {
                            // the code format is hexadecimal
                            // changes the range first and last codes to hexadecimal
                            $firstCode = $this->convertCode($firstCode);
                            $lastCode = $this->convertCode($lastCode);
                        }
                        // else if: UTF-8, leave the range codes in decimal
                        // captures the range and properties
                        $rangeList[] = array($firstCode => $firstProperties, $lastCode => $properties);
                    }
                }
            }
        }
    }
    // adds the ranges to the end
    if ($rangeList) {
        $data['code_ranges'] = $rangeList;
    }
    return $data;
}
/**
 * Splits a string of Unicode code points and converts each of them
 * to a UTF-8 character or to its UCN format
 *
 * Any character that is not a hexadecimal digit acts as a separator.
 *
 * @param string $codes the Unicode code points string,
 *                      e.g. a decomposition mapping: 0020 0308
 * @return array the UTF-8 characters or their UCN format,
 *               or the unconverted split result if no code point is found
 * @access public
 */
public function splitCodes($codes)
{
    // splits the string into hexadecimal code points, ignoring the empty pieces
    $hexCodes = preg_split('~[^\da-f]~i', $codes, -1, PREG_SPLIT_NO_EMPTY);
    if (!$hexCodes) {
        // nothing to convert, hands back the split result as is
        return $hexCodes;
    }
    // converts the code points to UTF-8 characters or their UCN format
    return array_map(array($this, 'convertCode'), $hexCodes);
}
}
?>