Skip to content

Commit

Permalink
Add getLanguages() and loaded models restrictions for static calls
Browse files Browse the repository at this point in the history
- Add manual
- Enforce tests
  • Loading branch information
landrok committed Sep 26, 2020
1 parent 7be487f commit af3df0b
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 20 deletions.
44 changes: 43 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,15 @@ Table of contents
- [API Methods](#api-methods)
- [evaluate()](#evaluate)
- [getLanguage()](#getlanguage)
- [getLanguages()](#getlanguages)
- [getScores()](#getscores)
- [getSupportedLanguages()](#getsupportedlanguages)
- [getText()](#gettext)
- [Options](#options)
- [For one-liners only](#for-one-liners-only)



Features
--------

Expand All @@ -32,7 +35,7 @@ Features
- Learning steps are already done, library is ready to use
- Small code, small footprint
- N-grams algorithm
- Supports PHP 5.4, 5.5, 5.6, 7.0, 7.1, 7.2, 7.3, 7.4 and HHVM
- Supports PHP 5.4, 5.5, 5.6, 7.0, 7.1, 7.2, 7.3, 7.4, 8.0 and HHVM


Install
Expand Down Expand Up @@ -160,6 +163,19 @@ $detector->getLanguage(); // Returns 'en'
```
________________________________________________________________________

#### getLanguages()

__Type__ *array*

The list of language codes whose models are loaded and will be evaluated.

__Example__

```php
$detector->getLanguages(); // Returns something like ['de', 'en', 'fr']
```
________________________________________________________________________

#### getScores()

__Type__ *array*
Expand Down Expand Up @@ -222,6 +238,26 @@ $detector->getText();
```
________________________________________________________________________

#### Options

__Type__ *\LanguageDetector\LanguageDetector*

For even better performance, the models to load can be restricted by passing a list of language codes as the second constructor argument.

__Example__

```php

$text = 'My tailor is rich and Alison is in the kitchen with Bob.';

$detector = new LanguageDetector(null, ['en', 'fr', 'de']);

$language = $detector->evaluate($text);

echo $language; // Prints something like 'en'
```
________________________________________________________________________

#### For one-liners only

__Type__ *\LanguageDetector\LanguageDetector*
Expand Down Expand Up @@ -260,5 +296,11 @@ print_r($detector->getSupportedLanguages());
// The last evaluated string
echo $detector->getText();

// Limit loaded languages for even better performance
echo LanguageDetector\LanguageDetector::detect(
'My tailor is rich and Alison is in the kitchen with Bob.',
['en', 'de', 'fr', 'es']
); // en

```
________________________________________________________________________
57 changes: 40 additions & 17 deletions src/LanguageDetector/LanguageDetector.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

/**
* LanguageDetector is the entry point for the detecting process.
*/
*/
class LanguageDetector
{
/**
Expand All @@ -41,25 +41,27 @@ class LanguageDetector

/**
 * Load the language models (subsets) that the detector will evaluate.
 *
 * @param string|null $dir       A directory where subsets are. When null,
 *                               the "subsets" directory next to this file
 *                               is used.
 * @param array       $languages Language codes to load models for. By
 *                               default (empty array), every entry found
 *                               in the directory is loaded.
 */
public function __construct($dir = null, array $languages = [])
{
    $datadir = null === $dir
        ? __DIR__ . '/subsets' : rtrim($dir, '/');

    // glob() returns false on error; fall back to an empty list so the
    // loop below never iterates over a non-array value.
    $files = glob($datadir . '/*') ?: [];

    foreach ($files as $file) {
        $code = basename($file);

        // An empty $languages list means "no restriction". The strict
        // comparison avoids loose string/number matches on codes.
        if (!count($languages) || in_array($code, $languages, true)) {
            $this->languages[$code] = new Language($file);
        }
    }
}

/**
* Evaluates that a string matches a language
*
*
* @param string $text
* @return \LanguageDetector\LanguageDetector
* @throws \InvalidArgumentException if $text is not a string
Expand Down Expand Up @@ -87,15 +89,25 @@ public function evaluate($text): self

/**
 * Static call for one-liners.
 *
 * A single detector instance is cached between calls. It is rebuilt when
 * no detector exists yet, or when a non-empty $languages list is given
 * and the cached detector's loaded models differ from that list. When
 * $languages is empty, an existing cached detector is reused as-is.
 *
 * @param string $text
 * @param array  $languages Language codes to load models for. By
 *                          default, all languages are loaded.
 * @return \LanguageDetector\LanguageDetector
 * @api
 */
public static function detect($text, array $languages = []): self
{
    // The null check must come first: calling getLanguages() on a
    // not-yet-created detector would be a fatal error.
    $reload = is_null(self::$detector);

    if (!$reload && count($languages)) {
        $loaded = self::$detector->getLanguages();

        // Compare in both directions: the cached detector may hold
        // models that were not requested, or lack requested ones.
        // NOTE(review): a requested code with no matching subset makes
        // the second check always true, so the detector is rebuilt on
        // every call for such input.
        $reload = count(array_diff($loaded, $languages)) > 0
            || count(array_diff($languages, $loaded)) > 0;
    }

    if ($reload) {
        self::$detector = new self(null, $languages);
    }

    return self::$detector->evaluate($text);
Expand Down Expand Up @@ -127,9 +139,20 @@ public function getLanguage($code = null): Language
return $this->languages[$code];
}

/**
 * Get the codes of the languages whose models are currently loaded.
 *
 * @return string[] An array of ISO codes
 * @api
 */
public function getLanguages(): array
{
    // Loaded models are indexed by their code, so the keys are the answer.
    $codes = array_keys($this->languages);

    return $codes;
}

/**
* Get all scored subsets
*
*
* @return array An array of ISO codes => scores
* @throws \Exception if nothing has been evaluated
* @api
Expand All @@ -145,7 +168,7 @@ public function getScores(): array

/**
* Get all supported languages
*
*
* @return array An array of ISO codes
* @api
*/
Expand All @@ -156,7 +179,7 @@ public function getSupportedLanguages(): array

/**
* Get evaluated text
*
*
* @return string
* @api
*/
Expand All @@ -167,7 +190,7 @@ public function getText(): string

/**
* Get best result when detector is used as a string
*
*
* @return string
*/
public function __toString(): string
Expand All @@ -177,14 +200,14 @@ public function __toString(): string

/**
* Evaluate probabilities for one language
*
*
* @param array $chunks
* @return \Closure An evaluator
*/
private function calculate(array $chunks): callable
{
return function($language, $code) use ($chunks) {
$this->scores[$code] =
$this->scores[$code] =
array_sum(
array_intersect_key(
$language->getFreq(),
Expand All @@ -197,15 +220,15 @@ private function calculate(array $chunks): callable

/**
* Chunk a text
*
*
* @return array
*/
private function chunk(): array
{
$chunks = [];
$len = mb_strlen($this->text);

// Chunk sizes
// Chunk sizes
for ($i = 0; $i < 3; $i++) {
for ($j = 0; $j < $len; $j++) {
if ($len > $j + $i) {
Expand Down
27 changes: 25 additions & 2 deletions tests/LanguageDetector/LanguageSubsetTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
namespace LanguageDetectorTest;

use LanguageDetector\Language;
use LanguageDetector\LanguageDetector;
use PHPUnit\Framework\TestCase;
use InvalidArgumentException;

Expand Down Expand Up @@ -74,7 +75,7 @@ public function getLanguageSubsetScenarios()

/**
* Tests that subsets are loaded
*
*
* @dataProvider getLanguageSubsetScenarios
*/
public function testSubsetContents($code, $expected = null)
Expand All @@ -99,7 +100,7 @@ public function testSubsetContents($code, $expected = null)

/**
* Tests that getCode returns a valid code
*
*
* @dataProvider getLanguageSubsetScenarios
*/
public function testSubsetGetCode($code, $expected = null)
Expand All @@ -126,4 +127,26 @@ public function testSubsetGetCode($code, $expected = null)
(string)$language
);
}

/**
 * Tests that only the requested subsets are loaded, both when the
 * detector is instantiated directly and when it is used statically.
 */
public function testLimitLoadedSubsets()
{
    $expected = ['da', 'en', 'no', 'sv'];

    // Instance usage: restrict models through the constructor
    $detector = new LanguageDetector(null, $expected);
    $this->assertEquals($expected, $detector->getLanguages());

    // Static usage: restrict models through detect()
    $detector = LanguageDetector::detect('ok', $expected);
    $this->assertEquals($expected, $detector->getLanguages());
}
}

0 comments on commit af3df0b

Please sign in to comment.