Skip to content

Commit

Permalink
Merge da6037b into e5172cb
Browse files Browse the repository at this point in the history
  • Loading branch information
berryzplus authored Dec 7, 2019
2 parents e5172cb + da6037b commit 1439eee
Show file tree
Hide file tree
Showing 13 changed files with 493 additions and 206 deletions.
4 changes: 4 additions & 0 deletions sakura/sakura.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@
<ClInclude Include="..\sakura_core\charset\CUnicodeBe.h" />
<ClInclude Include="..\sakura_core\charset\CUtf7.h" />
<ClInclude Include="..\sakura_core\charset\CUtf8.h" />
<ClInclude Include="..\sakura_core\charset\icu4c\CharsetDetector.h" />
<ClInclude Include="..\sakura_core\CHokanMgr.h" />
<ClInclude Include="..\sakura_core\CKeyWordSetMgr.h" />
<ClInclude Include="..\sakura_core\CLoadAgent.h" />
Expand Down Expand Up @@ -413,6 +414,7 @@
<ClInclude Include="..\sakura_core\extmodule\CBregexpDll2.h" />
<ClInclude Include="..\sakura_core\extmodule\CDllHandler.h" />
<ClInclude Include="..\sakura_core\extmodule\CHtmlHelp.h" />
<ClInclude Include="..\sakura_core\extmodule\CIcu4cI18n.h" />
<ClInclude Include="..\sakura_core\extmodule\CMigemo.h" />
<ClInclude Include="..\sakura_core\extmodule\CUxTheme.h" />
<ClInclude Include="..\sakura_core\Funccode_define.h" />
Expand Down Expand Up @@ -647,6 +649,7 @@
<ClCompile Include="..\sakura_core\charset\CUnicodeBe.cpp" />
<ClCompile Include="..\sakura_core\charset\CUtf7.cpp" />
<ClCompile Include="..\sakura_core\charset\CUtf8.cpp" />
<ClCompile Include="..\sakura_core\charset\icu4c\CharsetDetector.cpp" />
<ClCompile Include="..\sakura_core\CHokanMgr.cpp" />
<ClCompile Include="..\sakura_core\CKeyWordSetMgr.cpp" />
<ClCompile Include="..\sakura_core\CLoadAgent.cpp" />
Expand Down Expand Up @@ -776,6 +779,7 @@
<ClCompile Include="..\sakura_core\extmodule\CBregexpDll2.cpp" />
<ClCompile Include="..\sakura_core\extmodule\CDllHandler.cpp" />
<ClCompile Include="..\sakura_core\extmodule\CHtmlHelp.cpp" />
<ClCompile Include="..\sakura_core\extmodule\CIcu4cI18n.cpp" />
<ClCompile Include="..\sakura_core\extmodule\CMigemo.cpp" />
<ClCompile Include="..\sakura_core\extmodule\CUxTheme.cpp" />
<ClCompile Include="..\sakura_core\func\CFuncKeyWnd.cpp" />
Expand Down
15 changes: 15 additions & 0 deletions sakura/sakura.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@
<Filter Include="Cpp Source Files\uiparts">
<UniqueIdentifier>{930f3f82-ab3f-49e3-af4a-d4f9c2d51f46}</UniqueIdentifier>
</Filter>
<Filter Include="Cpp Source Files\charset\icu4c">
<UniqueIdentifier>{e4629f85-3be8-4dda-80db-1be310929433}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\sakura_core\Funccode_define.h">
Expand Down Expand Up @@ -1085,6 +1088,12 @@
<ClInclude Include="..\sakura_core\mem\CPoolResource.h">
<Filter>Cpp Source Files\mem</Filter>
</ClInclude>
<ClInclude Include="..\sakura_core\extmodule\CIcu4cI18n.h">
<Filter>Cpp Source Files\extmodule</Filter>
</ClInclude>
<ClInclude Include="..\sakura_core\charset\icu4c\CharsetDetector.h">
<Filter>Cpp Source Files\charset\icu4c</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="..\resource\auto_scroll_center.cur">
Expand Down Expand Up @@ -2252,6 +2261,12 @@
<ClCompile Include="..\sakura_core\dlg\CDlgOpenFile_CommonItemDialog.cpp">
<Filter>Cpp Source Files\dlg</Filter>
</ClCompile>
<ClCompile Include="..\sakura_core\extmodule\CIcu4cI18n.cpp">
<Filter>Cpp Source Files\extmodule</Filter>
</ClCompile>
<ClCompile Include="..\sakura_core\charset\icu4c\CharsetDetector.cpp">
<Filter>Cpp Source Files\charset\icu4c</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Image Include="..\resource\auto_scroll_center.bmp">
Expand Down
2 changes: 2 additions & 0 deletions sakura_core/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ charset/CUnicode.o \
charset/CUnicodeBe.o \
charset/CUtf7.o \
charset/CUtf8.o \
charset/icu4c/CharsetDetector.o \
cmd/CViewCommander.o \
cmd/CViewCommander_Bookmark.o \
cmd/CViewCommander_Clipboard.o \
Expand Down Expand Up @@ -228,6 +229,7 @@ extmodule/CBregexp.o \
extmodule/CBregexpDll2.o \
extmodule/CDllHandler.o \
extmodule/CHtmlHelp.o \
extmodule/CIcu4cI18n.o \
extmodule/CMigemo.o \
extmodule/CUxTheme.o \
func/CFuncKeyWnd.o \
Expand Down
6 changes: 1 addition & 5 deletions sakura_core/_os/CClipboard.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -605,11 +605,7 @@ bool CClipboard::GetClipboradByFormat(CNativeW& mem, const wchar_t* pFormatName,
}else{
ECodeType eMode = (ECodeType)nMode;
if( !IsValidCodeType(eMode) ){
// コード不明と99は自動判別
ECodeType nBomCode = CCodeMediator::DetectUnicodeBom((const char*)pData, nLength);
if( nBomCode != CODE_NONE ){
eMode = nBomCode;
}else{
{
const STypeConfig& type = CEditDoc::GetInstance(0)->m_cDocType.GetDocumentAttribute();
CCodeMediator mediator(type.m_encoding);
eMode = mediator.CheckKanjiCode((const char*)pData, nLength);
Expand Down
200 changes: 29 additions & 171 deletions sakura_core/charset/CCodeMediator.cpp
Original file line number Diff line number Diff line change
@@ -1,147 +1,9 @@
/*! @file */
#include "StdAfx.h"
#include "charset/CCodeMediator.h"
#include "charset/charcode.h"
#include "charset/icu4c/CharsetDetector.h"
#include "charset/CESI.h"
#include "io/CBinaryStream.h"
#include "types/CType.h"

/*!
	Checks whether the buffer begins with a Unicode byte-order mark (BOM).

	@param pS    buffer to inspect; NULL is accepted and reported as "no BOM"
	@param nLen  number of bytes readable at pS
	@retval CODE_UNICODE    buffer starts with FF FE (UTF-16 LE)
	@retval CODE_UNICODEBE  buffer starts with FE FF (UTF-16 BE)
	@retval CODE_UTF8       buffer starts with EF BB BF (UTF-8)
	@retval CODE_NONE       no BOM found, pS is NULL, or fewer than 2 bytes given
	@date 2007.08.11 moved here from charcode.cpp
*/
ECodeType CCodeMediator::DetectUnicodeBom( const char* pS, const int nLen )
{
	// Every BOM recognised here is at least two bytes long.
	if( pS == NULL || nLen < 2 ){
		return CODE_NONE;
	}

	// Compare as unsigned bytes so that values >= 0x80 match the literals below.
	const uchar_t* const pHead = reinterpret_cast<const uchar_t*>( pS );
	const uchar_t b0 = pHead[0];
	const uchar_t b1 = pHead[1];

	if( b0 == 0xff && b1 == 0xfe ){
		return CODE_UNICODE;
	}
	if( b0 == 0xfe && b1 == 0xff ){
		return CODE_UNICODEBE;
	}
	// The UTF-8 signature needs a third byte.
	if( nLen >= 3 && b0 == 0xef && b1 == 0xbb && pHead[2] == 0xbf ){
		return CODE_UTF8;
	}
#if 0
	// 2015.03.05 Moca: the UTF-7 BOM check was disabled.
	// If the data is ASCII compatible and valid as UTF-7, the character-code
	// comparison is expected to select UTF-7 anyway.
	if( 4 <= nLen ){
		if( memcmp( pHead, "+/v", 3 ) == 0
			&& ( pHead[3] == '8' || pHead[3] == '9' || pHead[3] == '+' || pHead[3] == '/' ) ){
			return CODE_UTF7;
		}
	}
#endif
	return CODE_NONE;
}

/*!
	Decides the multi-byte encoding from the evaluation data held by *pcesi.

	The decision is based solely on the first entry of pcesi->m_apMbcInfo.
	According to the original author's header, the candidates are
	SJIS, JIS, EUC-JP, UTF-8 and UTF-7.

	@param pcesi  evaluation data; dereferenced without a NULL check
	@return eCodeID of pcesi->m_apMbcInfo[0] when it is accepted, otherwise CODE_NONE.
	@note The outcome is also recorded through pcesi->SetStatus():
	      ESI_MBC_DETECTED on acceptance, ESI_NODETECTED on rejection.
*/
ECodeType CCodeMediator::DetectMBCode( CESI* pcesi )
{
	// Alias of the first candidate slot (a reference, so the slot is re-read on every use).
	const auto& pTop = pcesi->m_apMbcInfo[0];

	// Reject the candidate when either
	//  - (nSpecific - nPoints) * 2000 exceeds the data length. The original
	//    author's note describes this as requiring the share of invalid bytes to
	//    stay below 0.05% of the data, i.e. about 0.05% invalid bytes is tolerated; or
	//  - the candidate scored no points at all.
	if( pcesi->GetDataLen() < (pTop->nSpecific - pTop->nPoints) * 2000
		|| pTop->nPoints <= 0 ){
		pcesi->SetStatus( ESI_NODETECTED );
		return CODE_NONE;
	}

	// Record the successful detection before handing back the code ID.
	pcesi->SetStatus( ESI_MBC_DETECTED );
	return pTop->eCodeID;
}

/*!
	Decides between UTF-16 LE and UTF-16 BE using the evaluation data in *pcesi.

	@param pcesi  evaluation data; dereferenced without a NULL check
	@return eCodeID of pcesi->m_aWcInfo[<BOM type>] when accepted
	        (CODE_UNICODE / CODE_UNICODEBE per the original author's header).
	@retval CODE_NONE  neither UTF-16 LE nor UTF-16 BE was detected
	@note The outcome is also recorded through pcesi->SetStatus():
	      ESI_WC_DETECTED on acceptance, ESI_NODETECTED on rejection.
*/
ECodeType CCodeMediator::DetectUnicode( CESI* pcesi )
{
	const EBOMType eBomType = pcesi->GetBOMType();

	// An unknown BOM type rules out UTF-16 immediately.
	bool bAccepted = ( eBomType != ESI_BOMTYPE_UNKNOWN );

	if( bAccepted ){
		const int nDataLen    = pcesi->GetDataLen();
		const int nLineBreaks = pcesi->m_aWcInfo[eBomType].nSpecific;	// number of line breaks

		// Treat the data as "not Unicode" when the average line length exceeds 200.
		// Written as !(x > 200) on purpose: it keeps the original result for every
		// floating-point outcome of the division.
		// NOTE(review): nLineBreaks == 0 is not guarded here; the result then depends
		// on floating-point division-by-zero semantics - confirm this is intended.
		bAccepted = !( static_cast<double>(nDataLen) / nLineBreaks > 200 );
	}

	if( !bAccepted ){
		pcesi->SetStatus( ESI_NODETECTED );
		return CODE_NONE;
	}

	pcesi->SetStatus( ESI_WC_DETECTED );
	return pcesi->m_aWcInfo[eBomType].eCodeID;
}

/*
	Japanese code set detection for a CESI object prepared by the caller.

	Decision order:
	  1. pcesi == NULL                 -> CODE_DEFAULT
	  2. GetMetaName() != CODE_NONE    -> that value
	  3. DetectUnicode(), then DetectMBCode(); the first one that returns a code
	     other than CODE_NONE while the status is not ESI_NODETECTED wins
	  4. otherwise                     -> pcesi->m_pEncodingConfig->m_eDefaultCodetype

	The detectors record their outcome in the CESI status themselves.
*/
ECodeType CCodeMediator::CheckKanjiCode( CESI* pcesi )
{
	if( pcesi == NULL ){
		return CODE_DEFAULT;
	}

	if( pcesi->GetMetaName() != CODE_NONE ){
		return pcesi->GetMetaName();
	}

	// Detectors are tried in this fixed order: wide-character first, multi-byte second.
	typedef ECodeType (*DetectorFunc)( CESI* );
	const DetectorFunc aDetectors[] = {
		&CCodeMediator::DetectUnicode,
		&CCodeMediator::DetectMBCode,
	};

	for( const DetectorFunc pfnDetect : aDetectors ){
		const ECodeType eDetected = pfnDetect( pcesi );
		if( eDetected != CODE_NONE && pcesi->GetStatus() != ESI_NODETECTED ){
			return eDetected;
		}
	}

	// Nothing was detected: fall back to the configured default character code.
	return pcesi->m_pEncodingConfig->m_eDefaultCodetype;
}

/*
日本語コードセット判別
Expand All @@ -155,18 +17,22 @@ ECodeType CCodeMediator::CheckKanjiCode( CESI* pcesi )
UTF-7 CODE_UTF7
UnicodeBE CODE_UNICODEBE
*/
// NOTE(review): this span is a rendered diff hunk; lines of the pre-change and the
// post-change implementation appear interleaved without +/- markers.
//
// Pre-change lines (pBuf / nBufLen): build a CESI from *m_pEncodingConfig, feed it the
// buffer via SetInformation() and delegate to CheckKanjiCode( CESI* ).
// Post-change lines (buff / size): return the configured default code type for an empty
// buffer, try CharsetDetector when it reports IsAvailable(), and otherwise delegate to
// CESI::CheckKanjiCode().
// NOTE(review): the post-change signature is noexcept; if csd.Detect() or the CESI path
// can throw, that would end in std::terminate - confirm.
ECodeType CCodeMediator::CheckKanjiCode( const char* pBuf, int nBufLen )
ECodeType CCodeMediator::CheckKanjiCode(const char* buff, size_t size) noexcept
{
CESI cesi(*m_pEncodingConfig);
// For 0 bytes, use the per-document-type default setting.
if (size == 0) {
return m_sEncodingConfig.m_eDefaultCodetype;
}

/*
The detection status is recorded in
cesi.m_dwStatus inside
DetectMBCode() and DetectUnicode().
*/
// When the ICU4C DLLs are available, try detection with ICU4C first.
CharsetDetector csd;
if (csd.IsAvailable()) {
auto code = csd.Detect(std::string_view(buff, size));
if (code != CODE_ERROR) return code;
}

cesi.SetInformation( pBuf, nBufLen/*, CODE_SJIS*/ );
return CheckKanjiCode( &cesi );
CESI cesi(m_sEncodingConfig);
return cesi.CheckKanjiCode(buff, size);
}

/*
Expand All @@ -182,42 +48,34 @@ ECodeType CCodeMediator::CheckKanjiCode( const char* pBuf, int nBufLen )
|| UnicodeBE CODE_UNICODEBE
|| エラー CODE_ERROR
*/
// NOTE(review): this span is a rendered diff hunk; lines of the pre-change and the
// post-change implementation appear interleaved without +/- markers.
//
// Both variants open pszFile through CBinaryInputStream, read at most
// CheckKanjiCode_MAXREADLENGTH bytes from it, close the stream and run code set
// detection on the bytes; CODE_ERROR is returned when the stream cannot be opened
// (the post-change lines also return CODE_ERROR for a null pszFile).
ECodeType CCodeMediator::CheckKanjiCodeOfFile( const WCHAR* pszFile )
ECodeType CCodeMediator::CheckKanjiCodeOfFile(const WCHAR* pszFile)
{
if (!pszFile) {
return CODE_ERROR;
}

// Open the file.
CBinaryInputStream in(pszFile);
if(!in){
return CODE_ERROR;
}

// Get the data length, capped at the maximum read length.
int nBufLen = in.GetLength();
if( nBufLen > CheckKanjiCode_MAXREADLENGTH ){
nBufLen = CheckKanjiCode_MAXREADLENGTH;
}

// For 0 bytes, use the per-document-type default setting.
if( 0 == nBufLen ){
return m_pEncodingConfig->m_eDefaultCodetype;
}
auto size = std::min<size_t>(in.GetLength(), CheckKanjiCode_MAXREADLENGTH);

// Allocate the data buffer.
CMemory cMem;
cMem.AllocBuffer(nBufLen);
void* pBuf = cMem.GetRawPtr();
std::unique_ptr<char[]> buff;
if (size > 0)
{
// Allocate the data buffer.
buff = std::make_unique<char[]>(size);

// Read the data.
nBufLen = in.Read(pBuf, nBufLen);
// Read the data.
// NOTE(review): `ret` is never used afterwards; detection below is run with `size`,
// not with the value returned by Read() - confirm a short read cannot happen here.
auto ret = in.Read(buff.get(), size);
}

// Close the file.
in.Close();

// Japanese code set detection.
ECodeType nCodeType = DetectUnicodeBom( reinterpret_cast<const char*>(pBuf), nBufLen );
if( nCodeType == CODE_NONE ){
// No Unicode BOM was detected.
nCodeType = CheckKanjiCode( reinterpret_cast<const char*>(pBuf), nBufLen );
}

return nCodeType;
return CheckKanjiCode(buff.get(), size);
}
33 changes: 15 additions & 18 deletions sakura_core/charset/CCodeMediator.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,30 +24,27 @@
*/
#pragma once

#include "charset/CESI.h"
class CEditDoc;

// NOTE(review): this span is a rendered diff hunk of the class declaration; lines of the
// pre-change and the post-change declaration appear interleaved without +/- markers.
// Pre-change: non-final class holding a pointer to SEncodingConfig, with static
// DetectMBCode / DetectUnicode / DetectUnicodeBom / CheckKanjiCode( CESI* ) helpers.
// Post-change: final class holding a reference to SEncodingConfig, exposing only
// CheckKanjiCode( buff, size ) and CheckKanjiCodeOfFile( pszFile ).
class CCodeMediator{
protected:
// Move the detection functions from CESI.cpp to here.
static ECodeType DetectMBCode( CESI* pcesi );
static ECodeType DetectUnicode( CESI* pcesi );

#include "types/CType.h" //SEncodingConfig

/*!
* @brief CCodeMediator class
*
* Mediator class that hides the details of Japanese code set detection.
*/
class CCodeMediator final {
public:

explicit CCodeMediator( const SEncodingConfig &ref ) : m_pEncodingConfig(&ref) { }

static ECodeType DetectUnicodeBom( const char* pS, const int nLen );
// NOTE(review): only a reference to encodingConfig is stored, so the referenced
// object presumably has to outlive this mediator - confirm at the call sites.
explicit CCodeMediator(const SEncodingConfig &encodingConfig) noexcept
: m_sEncodingConfig(encodingConfig)
{
}

/* Japanese code set detection */
ECodeType CheckKanjiCode( const char* pBuf, int nBufLen );
ECodeType CheckKanjiCode(const char* buff, size_t size) noexcept;
/* Japanese code set detection for a file */
ECodeType CheckKanjiCodeOfFile( const WCHAR* pszFile );

static ECodeType CheckKanjiCode( CESI* pcesi ); // Used when the CESI structure(?) is constructed externally
ECodeType CheckKanjiCodeOfFile(const WCHAR* pszFile);

private:
const SEncodingConfig* m_pEncodingConfig;
const SEncodingConfig& m_sEncodingConfig;
};

/*[EOF]*/
Loading

0 comments on commit 1439eee

Please sign in to comment.