Skip to content

Commit

Permalink
Merge pull request #8 from photogabble/dev-1.1.0
Browse files Browse the repository at this point in the history
1.1.0 release
  • Loading branch information
carbontwelve authored Feb 17, 2022
2 parents 80201a7 + cbc8478 commit a389c80
Show file tree
Hide file tree
Showing 11 changed files with 12,656 additions and 47 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,11 @@ Please see the [tests](https://github.com/photogabble/php-confusable-homoglyphs/

* [Laravel Registration Validator package ](https://github.com/photogabble/laravel-registration-validator)
* If you use this package in your open source project please create a pull request to add a link here

## Is the data up to date?

This project currently ships with version 10.0.0 of the Unicode Consortium's public data.

The Unicode block aliases and names for each character are extracted from [this file](http://www.unicode.org/Public/UNIDATA/Scripts.txt) provided by the Unicode Consortium. The version this project currently ships with was generated on 11th March 2017.

The matrix of which characters can be confused with which other characters is built using [this file](http://www.unicode.org/Public/security/latest/confusables.txt) provided by the Unicode Consortium. The version this project currently ships with was generated on 8th April 2017.
1 change: 1 addition & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
],
"require": {
"php": ">=7.0.0",
"ext-json": "*",
"symfony/polyfill-mbstring": ">=1.3"
},
"require-dev": {
Expand Down
2 changes: 1 addition & 1 deletion src/Categories.php
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ public function uniqueAliases(string $string) : array
foreach (preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY) as $char) {
$alias = $this->alias($char);
if (! in_array($alias, $return)) {
array_push($return, $alias);
$return[] = $alias;
}
}
return $return;
Expand Down
121 changes: 121 additions & 0 deletions src/Categories/JsonGenerator.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
<?php

namespace Photogabble\ConfusableHomoglyphs\Categories;

use Exception;

class JsonGenerator
{
    /**
     * Timestamp read from the "Date:" header of the parsed source file.
     * Stays null until such a header line has been parsed.
     *
     * @var \DateTimeImmutable|null
     */
    private $sourceDatetime;

    /**
     * Parsed rows, each of the form
     * [first code point, last code point, alias index, category index].
     * The two indexes point into $iso15924Aliases and $categories.
     *
     * @var array
     */
    private $codePointsRanges = [];

    /**
     * Distinct upper-cased aliases in order of first appearance.
     *
     * @var array
     */
    private $iso15924Aliases = [];

    /**
     * Distinct category names in order of first appearance.
     *
     * @var array
     */
    private $categories = [];

    /**
     * Generates the categories data from the unicode specification file
     * found at the given `$filePathname`.
     *
     * Parsed rows are appended to whatever this instance already holds, so
     * calling this method more than once accumulates data from every file.
     * The collected code point ranges are sorted once reading has finished.
     *
     * @param string $filePathname
     * @throws Exception if the file does not exist, cannot be opened, or
     *                   contains a "Date:" header with an unusable timezone.
     */
    public function generateFromFile(string $filePathname)
    {
        if (!file_exists($filePathname)) {
            throw new Exception('The file found at ['.$filePathname.'] could not be read.');
        }

        $handle = fopen($filePathname, 'r');
        if ($handle === false) {
            throw new Exception('The file found at ['.$filePathname.'] could not be opened.');
        }

        // parseLine() may throw while building the timestamp, so the handle
        // is released in a finally block rather than after the loop.
        try {
            while (($line = fgets($handle)) !== false) {
                $this->parseLine($line);
            }
        } finally {
            fclose($handle);
        }

        sort($this->codePointsRanges);
    }

    /**
     * Parse the given $line into a code point range, alias and category.
     *
     * A line carrying a "Date:" header only updates the source timestamp.
     * Any other line that does not look like a data row is ignored.
     *
     * @param string $line
     * @throws Exception if the timezone found in a "Date:" header is not recognised.
     */
    private function parseLine(string $line)
    {
        // Capture groups: 1 = date, 4 = time, 8 = timezone abbreviation.
        if (preg_match('/Date: ([12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])), ((?:(?:([01]?\d|2[0-3]):)?([0-5]?\d):)?([0-5]?\d)) ([A-Z]+)/', $line, $dateMatches) > 0) {
            $this->sourceDatetime = new \DateTimeImmutable(
                $dateMatches[1] . ' ' . $dateMatches[4],
                new \DateTimeZone($dateMatches[8])
            );
            return;
        }

        // Capture groups: 1 = range start, 2 = optional range end, 3 = alias, 4 = category.
        if (preg_match('/([0-9A-F]+)(?:\.\.([0-9A-F]+))?\W+(\w+)\s*#\s*(\w+)/', $line, $matches) < 1) {
            return;
        }

        $codePointRangeFrom = $matches[1];
        // Group 2 is an empty string when the row describes a single code point.
        $codePointRangeTo = $matches[2] !== '' ? $matches[2] : $codePointRangeFrom;
        $alias = mb_strtoupper($matches[3]);
        $category = $matches[4];

        // One strict lookup both detects a new value and yields the index of
        // a known one; mixing a loose in_array() with a strict array_search()
        // could otherwise store `false` as an index.
        $aliasIndex = array_search($alias, $this->iso15924Aliases, true);
        if ($aliasIndex === false) {
            $this->iso15924Aliases[] = $alias;
            $aliasIndex = count($this->iso15924Aliases) - 1;
        }

        $categoryIndex = array_search($category, $this->categories, true);
        if ($categoryIndex === false) {
            $this->categories[] = $category;
            $categoryIndex = count($this->categories) - 1;
        }

        $this->codePointsRanges[] = [
            hexdec($codePointRangeFrom),
            hexdec($codePointRangeTo),
            $aliasIndex,
            $categoryIndex,
        ];
    }

    /**
     * Return categories data as an array.
     *
     * The result has the keys `timestamp` (ISO 8601 string), `code_points_ranges`,
     * `categories` and `iso_15924_aliases`.
     *
     * @return array
     * @throws Exception if no "Date:" header has been parsed yet, because the
     *                   timestamp cannot be produced without one.
     */
    public function toArray() : array
    {
        if ($this->sourceDatetime === null) {
            throw new Exception('No source date is available; parse a file containing a "Date:" header first.');
        }

        return [
            'timestamp' => $this->sourceDatetime->format('c'),
            'code_points_ranges' => $this->codePointsRanges,
            'categories' => $this->categories,
            'iso_15924_aliases' => $this->iso15924Aliases,
        ];
    }

    /**
     * Return categories data as a json string.
     *
     * @return string
     * @throws Exception if the data cannot be encoded, or if toArray() fails.
     */
    public function toJson() : string
    {
        $json = json_encode($this->toArray());
        if ($json === false) {
            throw new Exception(json_last_error_msg(), json_last_error());
        }

        return $json;
    }
}
9 changes: 4 additions & 5 deletions src/Confusable.php
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,7 @@ public function isConfusable(string $string, bool $greedy = false, array $prefer
if (in_array($char, $checked)) {
continue;
}
array_push($checked, $char);

$checked[] = $char;
$charAlias = $this->categories->alias($char);
if (in_array($charAlias, $preferredAliases)){
// It's safe if the character might be confusable with homoglyphs from other
Expand All @@ -145,7 +144,7 @@ public function isConfusable(string $string, bool $greedy = false, array $prefer

// The original source uses the Python dictionary get() method which returns a default
// if the key doesn't exist in the dictionary. This solves issue #2.
$found = isset($this->confusablesData[$char]) ? $this->confusablesData[$char] : [];
$found = $this->confusablesData[$char] ?? [];
// Character λ is considered confusable if λ can be confused with a character from
// $preferredAliases, e.g. if 'LATIN', 'ρ' is confusable with 'p' from LATIN.
// if 'LATIN', 'Γ' is not confusable because in all the characters confusable with Γ,
Expand All @@ -157,7 +156,7 @@ public function isConfusable(string $string, bool $greedy = false, array $prefer
$aliases = [];

foreach (preg_split('//u', $d['c'], -1, PREG_SPLIT_NO_EMPTY) as $glyph) {
array_push($aliases, $this->categories->alias($glyph));
$aliases[] = $this->categories->alias($glyph);
}

foreach ($aliases as $a) {
Expand All @@ -183,7 +182,7 @@ public function isConfusable(string $string, bool $greedy = false, array $prefer
if (!$greedy) {
return [$output];
}
array_push($outputs, $output);
$outputs[] = $output;
}
}

Expand Down
110 changes: 110 additions & 0 deletions src/Confusable/JsonGenerator.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
<?php

namespace Photogabble\ConfusableHomoglyphs\Confusable;

use Exception;

class JsonGenerator
{
    /**
     * Timestamp read from the "Date:" header of the parsed source file.
     * It is recorded while parsing but is not part of the toArray() output.
     *
     * @var \DateTimeImmutable|null
     */
    private $sourceDatetime;

    /**
     * Confusables matrix keyed by character. Each value is a list of
     * ['c' => confusable character(s), 'n' => their name] entries.
     *
     * @var array
     */
    private $confusablesMatrix = [];

    /**
     * Generates the confusables matrix from the unicode specification file
     * found at the given `$filePathname`.
     *
     * Parsed entries are appended to whatever this instance already holds, so
     * calling this method more than once accumulates data from every file.
     *
     * @param string $filePathname
     * @throws Exception if the file does not exist, cannot be opened, or
     *                   contains a "Date:" header with an unusable timezone.
     */
    public function generateFromFile(string $filePathname)
    {
        if (!file_exists($filePathname)) {
            throw new Exception('The file found at ['.$filePathname.'] could not be read.');
        }

        $handle = fopen($filePathname, 'r');
        if ($handle === false) {
            throw new Exception('The file found at ['.$filePathname.'] could not be opened.');
        }

        // parseLine() may throw while building the timestamp, so the handle
        // is released in a finally block rather than after the loop.
        try {
            while (($line = fgets($handle)) !== false) {
                $this->parseLine($line);
            }
        } finally {
            fclose($handle);
        }
    }

    /**
     * Parse the given $line into a pair of mutually confusable characters.
     *
     * A line carrying a "Date:" header only updates the source timestamp.
     * For a data row the relation is stored in both directions, so each
     * character lists the other one as a confusable.
     *
     * @param string $line
     * @throws Exception if the timezone found in a "Date:" header is not recognised.
     */
    private function parseLine(string $line)
    {
        // Capture groups: 1 = date, 4 = time, 8 = timezone abbreviation.
        if (preg_match('/Date: ([12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])), ((?:(?:([01]?\d|2[0-3]):)?([0-5]?\d):)?([0-5]?\d)) ([A-Z]+)/', $line, $dateMatches) > 0) {
            $this->sourceDatetime = new \DateTimeImmutable(
                $dateMatches[1] . ' ' . $dateMatches[4],
                new \DateTimeZone($dateMatches[8])
            );
            return;
        }

        // Capture groups: 1 = first character(s), 2 = second character(s),
        // 3 = name of the first, 4 = name of the second.
        if (preg_match('/[0-9A-F ]+\s+;\s*[0-9A-F ]+\s+;\s*\w+\s*#\*?\s*\( (.+) → (.+) \) (.+) → (.+)\t#/', $line, $matches) < 1) {
            return;
        }

        $charOne = $matches[1];
        $charTwo = $matches[2];
        $nameOne = $matches[3];
        $nameTwo = $matches[4];

        // Appending with [] creates the inner list on first use, so no
        // explicit initialisation is required.
        $this->confusablesMatrix[$charOne][] = [
            'c' => $charTwo,
            'n' => $nameTwo,
        ];

        $this->confusablesMatrix[$charTwo][] = [
            'c' => $charOne,
            'n' => $nameOne,
        ];
    }

    /**
     * Return the confusables matrix as an array.
     *
     * @return array
     */
    public function toArray() : array
    {
        return $this->confusablesMatrix;
    }

    /**
     * Return the confusables matrix as a json string.
     *
     * @return string
     * @throws Exception if the matrix cannot be encoded as JSON.
     */
    public function toJson() : string
    {
        $json = json_encode($this->toArray());
        if ($json === false) {
            throw new Exception(json_last_error_msg(), json_last_error());
        }

        return $json;
    }
}
Loading

0 comments on commit a389c80

Please sign in to comment.