Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICU4Cによる文字コード検出機能を追加する #1104

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions sakura/sakura.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@
<ClInclude Include="..\sakura_core\charset\CUnicodeBe.h" />
<ClInclude Include="..\sakura_core\charset\CUtf7.h" />
<ClInclude Include="..\sakura_core\charset\CUtf8.h" />
<ClInclude Include="..\sakura_core\charset\icu4c\CharsetDetector.h" />
<ClInclude Include="..\sakura_core\CHokanMgr.h" />
<ClInclude Include="..\sakura_core\CKeyWordSetMgr.h" />
<ClInclude Include="..\sakura_core\CLoadAgent.h" />
Expand Down Expand Up @@ -413,6 +414,7 @@
<ClInclude Include="..\sakura_core\extmodule\CBregexpDll2.h" />
<ClInclude Include="..\sakura_core\extmodule\CDllHandler.h" />
<ClInclude Include="..\sakura_core\extmodule\CHtmlHelp.h" />
<ClInclude Include="..\sakura_core\extmodule\CIcu4cI18n.h" />
<ClInclude Include="..\sakura_core\extmodule\CMigemo.h" />
<ClInclude Include="..\sakura_core\extmodule\CUxTheme.h" />
<ClInclude Include="..\sakura_core\Funccode_define.h" />
Expand Down Expand Up @@ -647,6 +649,7 @@
<ClCompile Include="..\sakura_core\charset\CUnicodeBe.cpp" />
<ClCompile Include="..\sakura_core\charset\CUtf7.cpp" />
<ClCompile Include="..\sakura_core\charset\CUtf8.cpp" />
<ClCompile Include="..\sakura_core\charset\icu4c\CharsetDetector.cpp" />
<ClCompile Include="..\sakura_core\CHokanMgr.cpp" />
<ClCompile Include="..\sakura_core\CKeyWordSetMgr.cpp" />
<ClCompile Include="..\sakura_core\CLoadAgent.cpp" />
Expand Down Expand Up @@ -776,6 +779,7 @@
<ClCompile Include="..\sakura_core\extmodule\CBregexpDll2.cpp" />
<ClCompile Include="..\sakura_core\extmodule\CDllHandler.cpp" />
<ClCompile Include="..\sakura_core\extmodule\CHtmlHelp.cpp" />
<ClCompile Include="..\sakura_core\extmodule\CIcu4cI18n.cpp" />
<ClCompile Include="..\sakura_core\extmodule\CMigemo.cpp" />
<ClCompile Include="..\sakura_core\extmodule\CUxTheme.cpp" />
<ClCompile Include="..\sakura_core\func\CFuncKeyWnd.cpp" />
Expand Down
15 changes: 15 additions & 0 deletions sakura/sakura.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@
<Filter Include="Cpp Source Files\uiparts">
<UniqueIdentifier>{930f3f82-ab3f-49e3-af4a-d4f9c2d51f46}</UniqueIdentifier>
</Filter>
<Filter Include="Cpp Source Files\charset\icu4c">
<UniqueIdentifier>{e4629f85-3be8-4dda-80db-1be310929433}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\sakura_core\Funccode_define.h">
Expand Down Expand Up @@ -1085,6 +1088,12 @@
<ClInclude Include="..\sakura_core\mem\CPoolResource.h">
<Filter>Cpp Source Files\mem</Filter>
</ClInclude>
<ClInclude Include="..\sakura_core\extmodule\CIcu4cI18n.h">
<Filter>Cpp Source Files\extmodule</Filter>
</ClInclude>
<ClInclude Include="..\sakura_core\charset\icu4c\CharsetDetector.h">
<Filter>Cpp Source Files\charset\icu4c</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="..\resource\auto_scroll_center.cur">
Expand Down Expand Up @@ -2252,6 +2261,12 @@
<ClCompile Include="..\sakura_core\dlg\CDlgOpenFile_CommonItemDialog.cpp">
<Filter>Cpp Source Files\dlg</Filter>
</ClCompile>
<ClCompile Include="..\sakura_core\extmodule\CIcu4cI18n.cpp">
<Filter>Cpp Source Files\extmodule</Filter>
</ClCompile>
<ClCompile Include="..\sakura_core\charset\icu4c\CharsetDetector.cpp">
<Filter>Cpp Source Files\charset\icu4c</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Image Include="..\resource\auto_scroll_center.bmp">
Expand Down
2 changes: 2 additions & 0 deletions sakura_core/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ charset/CUnicode.o \
charset/CUnicodeBe.o \
charset/CUtf7.o \
charset/CUtf8.o \
charset/icu4c/CharsetDetector.o \
cmd/CViewCommander.o \
cmd/CViewCommander_Bookmark.o \
cmd/CViewCommander_Clipboard.o \
Expand Down Expand Up @@ -228,6 +229,7 @@ extmodule/CBregexp.o \
extmodule/CBregexpDll2.o \
extmodule/CDllHandler.o \
extmodule/CHtmlHelp.o \
extmodule/CIcu4cI18n.o \
extmodule/CMigemo.o \
extmodule/CUxTheme.o \
func/CFuncKeyWnd.o \
Expand Down
6 changes: 1 addition & 5 deletions sakura_core/_os/CClipboard.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -605,11 +605,7 @@ bool CClipboard::GetClipboradByFormat(CNativeW& mem, const wchar_t* pFormatName,
}else{
ECodeType eMode = (ECodeType)nMode;
if( !IsValidCodeType(eMode) ){
// コード不明と99は自動判別
ECodeType nBomCode = CCodeMediator::DetectUnicodeBom((const char*)pData, nLength);
if( nBomCode != CODE_NONE ){
eMode = nBomCode;
}else{
{
const STypeConfig& type = CEditDoc::GetInstance(0)->m_cDocType.GetDocumentAttribute();
CCodeMediator mediator(type.m_encoding);
eMode = mediator.CheckKanjiCode((const char*)pData, nLength);
Expand Down
200 changes: 29 additions & 171 deletions sakura_core/charset/CCodeMediator.cpp
Original file line number Diff line number Diff line change
@@ -1,147 +1,9 @@
/*! @file */
#include "StdAfx.h"
#include "charset/CCodeMediator.h"
#include "charset/charcode.h"
#include "charset/icu4c/CharsetDetector.h"
#include "charset/CESI.h"
#include "io/CBinaryStream.h"
#include "types/CType.h"

/*!
	Checks whether the buffer starts with a Unicode byte order mark (BOM).

	@param pS    Start of the data to inspect. A null pointer yields CODE_NONE.
	@param nLen  Number of bytes readable at pS.

	@retval CODE_UNICODE    the data starts with FF FE (UTF-16 LE)
	@retval CODE_UNICODEBE  the data starts with FE FF (UTF-16 BE)
	@retval CODE_UTF8       the data starts with EF BB BF (UTF-8)
	@retval CODE_NONE       no BOM was found

	@date 2007.08.11 moved here from charcode.cpp
*/
ECodeType CCodeMediator::DetectUnicodeBom( const char* pS, const int nLen )
{
	// Every BOM recognised here is at least two bytes long.
	if( pS == nullptr || nLen < 2 ){
		return CODE_NONE;
	}

	// View the data as uchar_t so it can be compared against the BOM byte values.
	const uchar_t* const head = reinterpret_cast<const uchar_t*>( pS );

	switch( head[0] ){
	case 0xff:
		if( head[1] == 0xfe ){
			return CODE_UNICODE;
		}
		break;
	case 0xfe:
		if( head[1] == 0xff ){
			return CODE_UNICODEBE;
		}
		break;
	case 0xef:
		// The UTF-8 BOM needs a third byte.
		if( 3 <= nLen && head[1] == 0xbb && head[2] == 0xbf ){
			return CODE_UTF8;
		}
		break;
	default:
		break;
	}
#if 0
	// 2015.03.05 Moca: UTF-7 BOM detection was disabled.
	// If the data is ASCII compatible and valid as UTF-7, the encoding
	// comparison should end up selecting UTF-7 anyway.
	if( 4 <= nLen ){
		if( memcmp( head, "+/v", 3 ) == 0
			&& ( head[3] == '8' || head[3] == '9' || head[3] == '+' || head[3] == '/' ) ){
			return CODE_UTF7;
		}
	}
#endif
	return CODE_NONE;
}

/*!
	Decides the multi-byte encoding from the statistics held in a CESI object.

	Only the entry pcesi->m_apMbcInfo[0] is examined.

	@param pcesi  Analysis result to read; it is dereferenced without a null check.

	@return The eCodeID of m_apMbcInfo[0] when it is accepted (per the original
	        note, one of SJIS, JIS, EUCJP, UTF-8 or UTF-7); CODE_NONE otherwise.

	@note The outcome is also stored through pcesi->SetStatus():
	      ESI_MBC_DETECTED on success, ESI_NODETECTED on rejection.
*/
ECodeType CCodeMediator::DetectMBCode( CESI* pcesi )
{
	const auto& entry = *pcesi->m_apMbcInfo[0];

	// Reject when (nSpecific - nPoints) * 2000 exceeds the data length, i.e. when
	// that difference is more than 0.05% of the data. The original note describes
	// the difference as the number of invalid bytes; up to about 0.05% is tolerated.
	const bool exceedsTolerance =
		pcesi->GetDataLen() < (entry.nSpecific - entry.nPoints) * 2000;

	// An entry without a positive score is rejected as well.
	if( exceedsTolerance || entry.nPoints <= 0 ){
		pcesi->SetStatus( ESI_NODETECTED );
		return CODE_NONE;
	}

	// Record the successful detection before handing back the code ID.
	pcesi->SetStatus( ESI_MBC_DETECTED );
	return pcesi->m_apMbcInfo[0]->eCodeID;
}

/*!
	Decides between UTF-16 LE and UTF-16 BE from a CESI object.

	@param pcesi  Analysis result to read; it is dereferenced without a null check.

	@retval CODE_UNICODE    UTF-16 LE was detected
	@retval CODE_UNICODEBE  UTF-16 BE was detected
	@retval CODE_NONE       neither UTF-16 LE nor UTF-16 BE was detected

	@note The outcome is also stored through pcesi->SetStatus():
	      ESI_WC_DETECTED on success, ESI_NODETECTED otherwise.
*/
ECodeType CCodeMediator::DetectUnicode( CESI* pcesi )
{
	const EBOMType bomType = pcesi->GetBOMType();

	if( bomType != ESI_BOMTYPE_UNKNOWN ){
		const int dataLen = pcesi->GetDataLen();
		// Per the original note, nSpecific holds the number of line breaks here.
		const int lineBreaks = pcesi->m_aWcInfo[bomType].nSpecific;

		// An average of more than 200 units per line counts as "not Unicode".
		// Written as a negated '>' so that every operand value gives the same
		// outcome as a direct "> 200" rejection test.
		if( !( static_cast<double>(dataLen) / lineBreaks > 200 ) ){
			pcesi->SetStatus( ESI_WC_DETECTED );
			return pcesi->m_aWcInfo[bomType].eCodeID;
		}
	}

	// Unknown BOM type, or lines too long on average.
	pcesi->SetStatus( ESI_NODETECTED );
	return CODE_NONE;
}

/*
	Japanese character set detection.

	Order of precedence:
	  1. the code reported by pcesi->GetMetaName(), when it is not CODE_NONE
	  2. the result of DetectUnicode()
	  3. the result of DetectMBCode()
	  4. the default code type of pcesi->m_pEncodingConfig

	Returns CODE_DEFAULT when pcesi is null.
*/
ECodeType CCodeMediator::CheckKanjiCode( CESI* pcesi )
{
	if( pcesi == nullptr ){
		return CODE_DEFAULT;
	}

	if( pcesi->GetMetaName() != CODE_NONE ){
		return pcesi->GetMetaName();
	}

	// DetectUnicode() and DetectMBCode() record their outcome through
	// pcesi->SetStatus(); a candidate is accepted only when it is a real
	// code and the recorded status is not ESI_NODETECTED.
	const auto accepted = [pcesi]( ECodeType candidate ){
		return candidate != CODE_NONE && pcesi->GetStatus() != ESI_NODETECTED;
	};

	const ECodeType wideCode = DetectUnicode( pcesi );
	if( accepted( wideCode ) ){
		return wideCode;
	}

	const ECodeType multiByteCode = DetectMBCode( pcesi );
	if( accepted( multiByteCode ) ){
		return multiByteCode;
	}

	// Nothing was detected: fall back to the default code type.
	return pcesi->m_pEncodingConfig->m_eDefaultCodetype;
}

/*
日本語コードセット判別
Expand All @@ -155,18 +17,22 @@ ECodeType CCodeMediator::CheckKanjiCode( CESI* pcesi )
UTF-7 CODE_UTF7
UnicodeBE CODE_UNICODEBE
*/
ECodeType CCodeMediator::CheckKanjiCode( const char* pBuf, int nBufLen )
ECodeType CCodeMediator::CheckKanjiCode(const char* buff, size_t size) noexcept
{
CESI cesi(*m_pEncodingConfig);
// 0バイトならタイプ別のデフォルト設定
if (size == 0) {
return m_sEncodingConfig.m_eDefaultCodetype;
}

/*
判定状況は、
DetectMBCode(), DetectUnicode() 内で
cesi.m_dwStatus に記録する。
*/
// ICU4CのDLL群が利用できる場合、ICU4Cによる判定を試みる
CharsetDetector csd;
if (csd.IsAvailable()) {
auto code = csd.Detect(std::string_view(buff, size));
if (code != CODE_ERROR) return code;
}

cesi.SetInformation( pBuf, nBufLen/*, CODE_SJIS*/ );
return CheckKanjiCode( &cesi );
CESI cesi(m_sEncodingConfig);
return cesi.CheckKanjiCode(buff, size);
}

/*
Expand All @@ -182,42 +48,34 @@ ECodeType CCodeMediator::CheckKanjiCode( const char* pBuf, int nBufLen )
|| UnicodeBE CODE_UNICODEBE
|| エラー CODE_ERROR
*/
ECodeType CCodeMediator::CheckKanjiCodeOfFile( const WCHAR* pszFile )
ECodeType CCodeMediator::CheckKanjiCodeOfFile(const WCHAR* pszFile)
{
if (!pszFile) {
return CODE_ERROR;
}

// オープン
CBinaryInputStream in(pszFile);
if(!in){
return CODE_ERROR;
}

// データ長取得
int nBufLen = in.GetLength();
if( nBufLen > CheckKanjiCode_MAXREADLENGTH ){
nBufLen = CheckKanjiCode_MAXREADLENGTH;
}

// 0バイトならタイプ別のデフォルト設定
if( 0 == nBufLen ){
return m_pEncodingConfig->m_eDefaultCodetype;
}
auto size = std::min<size_t>(in.GetLength(), CheckKanjiCode_MAXREADLENGTH);

// データ確保
CMemory cMem;
cMem.AllocBuffer(nBufLen);
void* pBuf = cMem.GetRawPtr();
std::unique_ptr<char[]> buff;
if (size > 0)
{
// データ確保
buff = std::make_unique<char[]>(size);
beru marked this conversation as resolved.
Show resolved Hide resolved

// 読み込み
nBufLen = in.Read(pBuf, nBufLen);
// 読み込み
auto ret = in.Read(buff.get(), size);
}

// クローズ
in.Close();

// 日本語コードセット判別
ECodeType nCodeType = DetectUnicodeBom( reinterpret_cast<const char*>(pBuf), nBufLen );
if( nCodeType == CODE_NONE ){
// Unicode BOM は検出されませんでした.
nCodeType = CheckKanjiCode( reinterpret_cast<const char*>(pBuf), nBufLen );
}

return nCodeType;
return CheckKanjiCode(buff.get(), size);
}
33 changes: 15 additions & 18 deletions sakura_core/charset/CCodeMediator.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,30 +24,27 @@
*/
#pragma once

#include "charset/CESI.h"
class CEditDoc;

class CCodeMediator{
protected:
// CESI.cpp の判定関数をここに移す
static ECodeType DetectMBCode( CESI* pcesi );
static ECodeType DetectUnicode( CESI* pcesi );

#include "types/CType.h" //SEncodingConfig

/*!
* @brief CCodeMediator クラス
*
* 日本語コードセット判別の詳細を隠ぺいするための仲介クラスです。
*/
class CCodeMediator final {
public:

explicit CCodeMediator( const SEncodingConfig &ref ) : m_pEncodingConfig(&ref) { }

static ECodeType DetectUnicodeBom( const char* pS, const int nLen );
explicit CCodeMediator(const SEncodingConfig &encodingConfig) noexcept
: m_sEncodingConfig(encodingConfig)
{
}

/* 日本語コードセット判別 */
ECodeType CheckKanjiCode( const char* pBuf, int nBufLen );
ECodeType CheckKanjiCode(const char* buff, size_t size) noexcept;
/* ファイルの日本語コードセット判別 */
ECodeType CheckKanjiCodeOfFile( const WCHAR* pszFile );

static ECodeType CheckKanjiCode( CESI* pcesi ); // CESI 構造体(?)を外部で構築した場合に使用
ECodeType CheckKanjiCodeOfFile(const WCHAR* pszFile);

private:
const SEncodingConfig* m_pEncodingConfig;
const SEncodingConfig& m_sEncodingConfig;
};

/*[EOF]*/
Loading