From 00410f22fee1242fa1756824ccfa51d4778873b3 Mon Sep 17 00:00:00 2001 From: berryplus Date: Fri, 29 Nov 2019 22:55:19 +0900 Subject: [PATCH 1/4] =?UTF-8?q?CCodeMediator=E3=83=AA=E3=83=95=E3=82=A1?= =?UTF-8?q?=E3=82=AF=E3=82=BF=E3=83=AA=E3=83=B3=E3=82=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CCodeMediator.hからCESI.h依存を追い出して仲介者としての役割を果たせるようにする。 --- sakura_core/_os/CClipboard.cpp | 6 +- sakura_core/charset/CCodeMediator.cpp | 194 +++----------------------- sakura_core/charset/CCodeMediator.h | 33 ++--- sakura_core/charset/CESI.cpp | 101 +++++++++++++- sakura_core/charset/CESI.h | 43 +++++- sakura_core/io/CFileLoad.cpp | 10 +- 6 files changed, 180 insertions(+), 207 deletions(-) diff --git a/sakura_core/_os/CClipboard.cpp b/sakura_core/_os/CClipboard.cpp index a6a6e7fade..3673c49488 100644 --- a/sakura_core/_os/CClipboard.cpp +++ b/sakura_core/_os/CClipboard.cpp @@ -605,11 +605,7 @@ bool CClipboard::GetClipboradByFormat(CNativeW& mem, const wchar_t* pFormatName, }else{ ECodeType eMode = (ECodeType)nMode; if( !IsValidCodeType(eMode) ){ - // コード不明と99は自動判別 - ECodeType nBomCode = CCodeMediator::DetectUnicodeBom((const char*)pData, nLength); - if( nBomCode != CODE_NONE ){ - eMode = nBomCode; - }else{ + { const STypeConfig& type = CEditDoc::GetInstance(0)->m_cDocType.GetDocumentAttribute(); CCodeMediator mediator(type.m_encoding); eMode = mediator.CheckKanjiCode((const char*)pData, nLength); diff --git a/sakura_core/charset/CCodeMediator.cpp b/sakura_core/charset/CCodeMediator.cpp index 2d1515f177..ebfa41b87f 100644 --- a/sakura_core/charset/CCodeMediator.cpp +++ b/sakura_core/charset/CCodeMediator.cpp @@ -1,147 +1,8 @@ /*! @file */ #include "StdAfx.h" #include "charset/CCodeMediator.h" -#include "charset/charcode.h" #include "charset/CESI.h" #include "io/CBinaryStream.h" -#include "types/CType.h" - -/*! - 文字列の先頭にUnicode系BOMが付いているか? - - @retval CODE_UNICODE UTF-16 LE - @retval CODE_UTF8 UTF-8 - @retval CODE_UNICODEBE UTF-16 BE - @retval CODE_NONE 未検出 - - @date 2007.08.11 charcode.cpp から移動 -*/ -ECodeType CCodeMediator::DetectUnicodeBom( const char* pS, const int nLen ) -{ - uchar_t *pBuf; - - if( NULL == pS ){ return CODE_NONE; } - - pBuf = (uchar_t *) pS; - if( 2 <= nLen ){ - if( pBuf[0] == 0xff && pBuf[1] == 0xfe ){ - return CODE_UNICODE; - } - if( pBuf[0] == 0xfe && pBuf[1] == 0xff ){ - return CODE_UNICODEBE; - } - if( 3 <= nLen ){ - if( pBuf[0] == 0xef && pBuf[1] == 0xbb && pBuf[2] == 0xbf ){ - return CODE_UTF8; - } - } - } -#if 0 -// 2015.03.05 Moca UTF-7 BOMは無効に変更 -// もしデータがASCII互換でUTF-7として正しければ、文字コード比較でUTF-7になるはず - if( 4 <= nLen ){ - if( memcmp( pBuf, "+/v", 3 ) == 0 - && ( pBuf[3] == '8' || pBuf[3] == '9' || pBuf[3] == '+' || pBuf[3] == '/' ) ){ - return CODE_UTF7; - } - } -#endif - return CODE_NONE; -} - -/*! - SJIS, JIS, EUCJP, UTF-8, UTF-7 を判定 (改) - - @return SJIS, JIS, EUCJP, UTF-8, UTF-7 の何れかの ID を返す. - - @note 適切な検出が行われた場合は、m_dwStatus に CESI_MB_DETECTED フラグが格納される。 -*/ -ECodeType CCodeMediator::DetectMBCode( CESI* pcesi ) -{ -// pcesi->m_dwStatus = ESI_NOINFORMATION; - - if( pcesi->GetDataLen() < (pcesi->m_apMbcInfo[0]->nSpecific - pcesi->m_apMbcInfo[0]->nPoints) * 2000 ){ - // 不正バイトの割合が、全体の 0.05% 未満であることを確認。 - // 全体の0.05%ほどの不正バイトは、無視する。 - pcesi->SetStatus( ESI_NODETECTED ); - return CODE_NONE; - } - if( pcesi->m_apMbcInfo[0]->nPoints <= 0 ){ - pcesi->SetStatus( ESI_NODETECTED ); - return CODE_NONE; - } - - /* - 判定状況を確認 - */ - pcesi->SetStatus( ESI_MBC_DETECTED ); - return pcesi->m_apMbcInfo[0]->eCodeID; -} - -/*! - UTF-16 LE/BE を判定. - - @retval CODE_UNICODE UTF-16 LE が検出された - @retval CODE_UNICODEBE UTF-16 BE が検出された - @retval 0 UTF-16 LE/BE ともに検出されなかった - -*/ -ECodeType CCodeMediator::DetectUnicode( CESI* pcesi ) -{ -// pcesi->m_dwStatus = ESI_NOINFORMATION; - - EBOMType ebom_type = pcesi->GetBOMType(); - int ndatalen; - int nlinebreak; - - if( ebom_type == ESI_BOMTYPE_UNKNOWN ){ - pcesi->SetStatus( ESI_NODETECTED ); - return CODE_NONE; - } - - // 1行の平均桁数が200を超えている場合はUnicode未検出とする - ndatalen = pcesi->GetDataLen(); - nlinebreak = pcesi->m_aWcInfo[ebom_type].nSpecific; // 改行数を nlinebreakに取得 - if( static_cast(ndatalen) / nlinebreak > 200 ){ - pcesi->SetStatus( ESI_NODETECTED ); - return CODE_NONE; - } - - pcesi->SetStatus( ESI_WC_DETECTED ); - return pcesi->m_aWcInfo[ebom_type].eCodeID; -} - -/* - 日本語コードセット判定 -*/ -ECodeType CCodeMediator::CheckKanjiCode( CESI* pcesi ) -{ - ECodeType nret; - - /* - 判定状況は、 - DetectMBCode(), DetectUnicode() 内で - cesi.m_dwStatus に記録する。 - */ - - if( pcesi == NULL ){ - return CODE_DEFAULT; - } - if( pcesi->GetMetaName() != CODE_NONE ){ - return pcesi->GetMetaName(); - } - nret = DetectUnicode( pcesi ); - if( nret != CODE_NONE && pcesi->GetStatus() != ESI_NODETECTED ){ - return nret; - } - nret = DetectMBCode( pcesi ); - if( nret != CODE_NONE && pcesi->GetStatus() != ESI_NODETECTED ){ - return nret; - } - - // デフォルト文字コードを返す - return pcesi->m_pEncodingConfig->m_eDefaultCodetype; -} /* 日本語コードセット判別 @@ -155,18 +16,15 @@ ECodeType CCodeMediator::CheckKanjiCode( CESI* pcesi ) UTF-7 CODE_UTF7 UnicodeBE CODE_UNICODEBE */ -ECodeType CCodeMediator::CheckKanjiCode( const char* pBuf, int nBufLen ) +ECodeType CCodeMediator::CheckKanjiCode(const char* buff, size_t size) noexcept { - CESI cesi(*m_pEncodingConfig); - - /* - 判定状況は、 - DetectMBCode(), DetectUnicode() 内で - cesi.m_dwStatus に記録する。 - */ + // 0バイトならタイプ別のデフォルト設定 + if (size == 0) { + return m_sEncodingConfig.m_eDefaultCodetype; + } - cesi.SetInformation( pBuf, nBufLen/*, CODE_SJIS*/ ); - return CheckKanjiCode( &cesi ); + CESI cesi(m_sEncodingConfig); + return cesi.CheckKanjiCode(buff, size); } /* @@ -182,8 +40,12 @@ ECodeType CCodeMediator::CheckKanjiCode( const char* pBuf, int nBufLen ) || UnicodeBE CODE_UNICODEBE || エラー CODE_ERROR */ -ECodeType CCodeMediator::CheckKanjiCodeOfFile( const WCHAR* pszFile ) +ECodeType CCodeMediator::CheckKanjiCodeOfFile(const WCHAR* pszFile) { + if (!pszFile) { + return CODE_ERROR; + } + // オープン CBinaryInputStream in(pszFile); if(!in){ @@ -191,33 +53,21 @@ ECodeType CCodeMediator::CheckKanjiCodeOfFile( const WCHAR* pszFile ) } // データ長取得 - int nBufLen = in.GetLength(); - if( nBufLen > CheckKanjiCode_MAXREADLENGTH ){ - nBufLen = CheckKanjiCode_MAXREADLENGTH; - } - - // 0バイトならタイプ別のデフォルト設定 - if( 0 == nBufLen ){ - return m_pEncodingConfig->m_eDefaultCodetype; - } + auto size = std::min(in.GetLength(), CheckKanjiCode_MAXREADLENGTH); - // データ確保 - CMemory cMem; - cMem.AllocBuffer(nBufLen); - void* pBuf = cMem.GetRawPtr(); + std::unique_ptr buff; + if (size > 0) + { + // データ確保 + buff = std::make_unique(size); - // 読み込み - nBufLen = in.Read(pBuf, nBufLen); + // 読み込み + auto ret = in.Read(buff.get(), size); + } // クローズ in.Close(); // 日本語コードセット判別 - ECodeType nCodeType = DetectUnicodeBom( reinterpret_cast(pBuf), nBufLen ); - if( nCodeType == CODE_NONE ){ - // Unicode BOM は検出されませんでした. - nCodeType = CheckKanjiCode( reinterpret_cast(pBuf), nBufLen ); - } - - return nCodeType; + return CheckKanjiCode(buff.get(), size); } diff --git a/sakura_core/charset/CCodeMediator.h b/sakura_core/charset/CCodeMediator.h index 48348d9c0d..6d7af2702b 100644 --- a/sakura_core/charset/CCodeMediator.h +++ b/sakura_core/charset/CCodeMediator.h @@ -24,30 +24,27 @@ */ #pragma once -#include "charset/CESI.h" -class CEditDoc; - -class CCodeMediator{ -protected: - // CESI.cpp の判定関数をここに移す - static ECodeType DetectMBCode( CESI* pcesi ); - static ECodeType DetectUnicode( CESI* pcesi ); - +#include "types/CType.h" //SEncodingConfig + +/*! + * @brief CCodeMediator クラス + * + * 日本語コードセット判別の詳細を隠ぺいするための仲介クラスです。 + */ +class CCodeMediator final { public: - - explicit CCodeMediator( const SEncodingConfig &ref ) : m_pEncodingConfig(&ref) { } - - static ECodeType DetectUnicodeBom( const char* pS, const int nLen ); + explicit CCodeMediator(const SEncodingConfig &encodingConfig) noexcept + : m_sEncodingConfig(encodingConfig) + { + } /* 日本語コードセット判別 */ - ECodeType CheckKanjiCode( const char* pBuf, int nBufLen ); + ECodeType CheckKanjiCode(const char* buff, size_t size) noexcept; /* ファイルの日本語コードセット判別 */ - ECodeType CheckKanjiCodeOfFile( const WCHAR* pszFile ); - - static ECodeType CheckKanjiCode( CESI* pcesi ); // CESI 構造体(?)を外部で構築した場合に使用 + ECodeType CheckKanjiCodeOfFile(const WCHAR* pszFile); private: - const SEncodingConfig* m_pEncodingConfig; + const SEncodingConfig& m_sEncodingConfig; }; /*[EOF]*/ diff --git a/sakura_core/charset/CESI.cpp b/sakura_core/charset/CESI.cpp index 1bd905c234..79d73c04a7 100644 --- a/sakura_core/charset/CESI.cpp +++ b/sakura_core/charset/CESI.cpp @@ -1134,6 +1134,103 @@ ECodeType CESI::AutoDetectByCoding( const char* pBuf, int nSize ) return CODE_NONE; } +/*! + SJIS, JIS, EUCJP, UTF-8, UTF-7 を判定 (改) + + @return SJIS, JIS, EUCJP, UTF-8, UTF-7 の何れかの ID を返す. + + @note 適切な検出が行われた場合は、m_dwStatus に CESI_MB_DETECTED フラグが格納される。 +*/ +static ECodeType DetectMBCode( CESI* pcesi ) +{ +// pcesi->m_dwStatus = ESI_NOINFORMATION; + + if( pcesi->GetDataLen() < (pcesi->m_apMbcInfo[0]->nSpecific - pcesi->m_apMbcInfo[0]->nPoints) * 2000 ){ + // 不正バイトの割合が、全体の 0.05% 未満であることを確認。 + // 全体の0.05%ほどの不正バイトは、無視する。 + pcesi->SetStatus( ESI_NODETECTED ); + return CODE_NONE; + } + if( pcesi->m_apMbcInfo[0]->nPoints <= 0 ){ + pcesi->SetStatus( ESI_NODETECTED ); + return CODE_NONE; + } + + /* + 判定状況を確認 + */ + pcesi->SetStatus( ESI_MBC_DETECTED ); + return pcesi->m_apMbcInfo[0]->eCodeID; +} + +/*! + UTF-16 LE/BE を判定. + + @retval CODE_UNICODE UTF-16 LE が検出された + @retval CODE_UNICODEBE UTF-16 BE が検出された + @retval 0 UTF-16 LE/BE ともに検出されなかった + +*/ +static ECodeType DetectUnicode( CESI* pcesi ) +{ +// pcesi->m_dwStatus = ESI_NOINFORMATION; + + EBOMType ebom_type = pcesi->GetBOMType(); + int ndatalen; + int nlinebreak; + + if( ebom_type == ESI_BOMTYPE_UNKNOWN ){ + pcesi->SetStatus( ESI_NODETECTED ); + return CODE_NONE; + } + + // 1行の平均桁数が200を超えている場合はUnicode未検出とする + ndatalen = pcesi->GetDataLen(); + nlinebreak = pcesi->m_aWcInfo[ebom_type].nSpecific; // 改行数を nlinebreakに取得 + if( static_cast(ndatalen) / nlinebreak > 200 ){ + pcesi->SetStatus( ESI_NODETECTED ); + return CODE_NONE; + } + + pcesi->SetStatus( ESI_WC_DETECTED ); + return pcesi->m_aWcInfo[ebom_type].eCodeID; +} + +/* + 日本語コードセット判定 +*/ +ECodeType CESI::CheckKanjiCode(const char* pBuf, size_t nBufLen) noexcept +{ + + // 日本語コードセット判別 + ECodeType nCodeType = DetectUnicodeBom(pBuf, nBufLen); + if (nCodeType != CODE_NONE) { + return nCodeType; + } + + /* + 判定状況は、 + DetectMBCode(), DetectUnicode() 内で + cesi.m_dwStatus に記録する。 + */ + SetInformation(pBuf, nBufLen); + + if( GetMetaName() != CODE_NONE ){ + return GetMetaName(); + } + auto nret = DetectUnicode( this ); + if( nret != CODE_NONE && GetStatus() != ESI_NODETECTED ){ + return nret; + } + nret = DetectMBCode( this ); + if( nret != CODE_NONE && GetStatus() != ESI_NODETECTED ){ + return nret; + } + + // デフォルト文字コードを返す + return m_pEncodingConfig->m_eDefaultCodetype; +} + #ifdef _DEBUG /*! @@ -1152,8 +1249,8 @@ void CESI::GetDebugInfo( const char* pS, const int nLen, CNativeW* pcmtxtOut ) CESI cesi( doc.m_cDocType.GetDocumentAttribute().m_encoding ); // テスト実行 - cesi.SetInformation( pS, nLen/*, CODE_SJIS*/ ); - ecode_result = CCodeMediator::CheckKanjiCode( &cesi ); + ecode_result = cesi.CheckKanjiCode(pS, nLen); + ecode_result = CODE_ERROR; // // 判別結果を分析 diff --git a/sakura_core/charset/CESI.h b/sakura_core/charset/CESI.h index 664100704d..212d7d111a 100644 --- a/sakura_core/charset/CESI.h +++ b/sakura_core/charset/CESI.h @@ -91,10 +91,14 @@ class CESI { m_eMetaName = CODE_NONE; } - //! 調査結果の情報を格納 - void SetInformation( const char *pS, const int nLen ); + //! 日本語コードセット判定 + ECodeType CheckKanjiCode(const char* buff, size_t size) noexcept; protected: + ECodeType DetectUnicodeBom(const char* pS, size_t nLen) noexcept; + + //! 調査結果の情報を格納 + void SetInformation( const char *pS, const int nLen ); //! 添え字に使われる優先順位表を作成 void InitPriorityTable( void ); @@ -216,4 +220,39 @@ class CESI { #endif }; +/*! + 文字列の先頭にUnicode系BOMが付いているか? + + @retval CODE_UNICODE UTF-16 LE + @retval CODE_UTF8 UTF-8 + @retval CODE_UNICODEBE UTF-16 BE + @retval CODE_NONE 未検出 + + @date 2007.08.11 charcode.cpp から移動 + @date 2015.03.05 Moca UTF-7 BOMは無効に変更 + */ +inline +ECodeType CESI::DetectUnicodeBom(const char* buff, size_t size) noexcept +{ + if (!buff) return CODE_NONE; + + constexpr const unsigned char szUtf8BOM[]{ 0xef, 0xbb, 0xbf }; + constexpr const unsigned char szUtf16BeBOM[]{ 0xff, 0xfe }; + constexpr const unsigned char szUtf16LeBOM[]{ 0xfe, 0xff }; + + if (size >= _countof(szUtf8BOM) - 1 + && 0 == ::memcmp(buff, szUtf8BOM, _countof(szUtf8BOM) - 1)) { + return CODE_UTF8; + } + if (size >= _countof(szUtf16BeBOM) - 1 + && 0 == ::memcmp(buff, szUtf16BeBOM, _countof(szUtf16BeBOM) - 1)) { + return CODE_UNICODEBE; + } + if (size >= _countof(szUtf16LeBOM) - 1 + && 0 == ::memcmp(buff, szUtf16LeBOM, _countof(szUtf16LeBOM) - 1)) { + return CODE_UNICODE; + } + return CODE_NONE; +} + /*[EOF]*/ diff --git a/sakura_core/io/CFileLoad.cpp b/sakura_core/io/CFileLoad.cpp index 45b81065bc..f30383c170 100644 --- a/sakura_core/io/CFileLoad.cpp +++ b/sakura_core/io/CFileLoad.cpp @@ -157,7 +157,6 @@ ECodeType CFileLoad::FileOpen( LPCWSTR pFileName, bool bBigFile, ECodeType CharC { HANDLE hFile; ULARGE_INTEGER fileSize; - ECodeType nBomCode; // FileCloseを呼んでからにしてください if( NULL != m_hFile ){ @@ -203,14 +202,9 @@ ECodeType CFileLoad::FileOpen( LPCWSTR pFileName, bool bBigFile, ECodeType CharC // データ読み込み Buffering(); - nBomCode = CCodeMediator::DetectUnicodeBom( m_pReadBuf, m_nReadDataLen ); if( CharCode == CODE_AUTODETECT ){ - if( nBomCode != CODE_NONE ){ - CharCode = nBomCode; - }else{ - CCodeMediator mediator(*m_pEencoding); - CharCode = mediator.CheckKanjiCode( m_pReadBuf, m_nReadDataLen ); - } + CCodeMediator mediator(*m_pEencoding); + CharCode = mediator.CheckKanjiCode(m_pReadBuf, m_nReadDataLen); } // To Here Jun. 08, 2003 // 不正な文字コードのときはデフォルト(SJIS:無変換)を設定 From fc94768821082f9cbf27a37bf32a190b44a7e2c2 Mon Sep 17 00:00:00 2001 From: berryplus Date: Sat, 30 Nov 2019 17:47:44 +0900 Subject: [PATCH 2/4] =?UTF-8?q?ICU4C=E3=81=AB=E3=82=88=E3=82=8B=E6=96=87?= =?UTF-8?q?=E5=AD=97=E3=82=B3=E3=83=BC=E3=83=89=E6=A4=9C=E5=87=BA=E6=A9=9F?= =?UTF-8?q?=E8=83=BD=E3=82=92=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sakura/sakura.vcxproj | 4 + sakura/sakura.vcxproj.filters | 15 ++++ sakura_core/Makefile | 2 + sakura_core/charset/CCodeMediator.cpp | 8 ++ sakura_core/charset/icu4c/CharsetDetector.cpp | 77 ++++++++++++++++++ sakura_core/charset/icu4c/CharsetDetector.h | 48 +++++++++++ sakura_core/extmodule/CIcu4cI18n.cpp | 69 ++++++++++++++++ sakura_core/extmodule/CIcu4cI18n.h | 81 +++++++++++++++++++ 8 files changed, 304 insertions(+) create mode 100644 sakura_core/charset/icu4c/CharsetDetector.cpp create mode 100644 sakura_core/charset/icu4c/CharsetDetector.h create mode 100644 sakura_core/extmodule/CIcu4cI18n.cpp create mode 100644 sakura_core/extmodule/CIcu4cI18n.h diff --git a/sakura/sakura.vcxproj b/sakura/sakura.vcxproj index cf905c0f9f..a89b337a14 100644 --- a/sakura/sakura.vcxproj +++ b/sakura/sakura.vcxproj @@ -303,6 +303,7 @@ + @@ -413,6 +414,7 @@ + @@ -647,6 +649,7 @@ + @@ -776,6 +779,7 @@ + diff --git a/sakura/sakura.vcxproj.filters b/sakura/sakura.vcxproj.filters index 0b21690eb5..be2661ada1 100644 --- a/sakura/sakura.vcxproj.filters +++ b/sakura/sakura.vcxproj.filters @@ -119,6 +119,9 @@ {930f3f82-ab3f-49e3-af4a-d4f9c2d51f46} + + {e4629f85-3be8-4dda-80db-1be310929433} + @@ -1085,6 +1088,12 @@ Cpp Source Files\mem + + Cpp Source Files\extmodule + + + Cpp Source Files\charset\icu4c + @@ -2252,6 +2261,12 @@ Cpp Source Files\dlg + + Cpp Source Files\extmodule + + + Cpp Source Files\charset\icu4c + diff --git a/sakura_core/Makefile b/sakura_core/Makefile index 5ea88e13f6..55d68a600b 100644 --- a/sakura_core/Makefile +++ b/sakura_core/Makefile @@ -115,6 +115,7 @@ charset/CUnicode.o \ charset/CUnicodeBe.o \ charset/CUtf7.o \ charset/CUtf8.o \ +charset/icu4c/CharsetDetector.o \ cmd/CViewCommander.o \ cmd/CViewCommander_Bookmark.o \ cmd/CViewCommander_Clipboard.o \ @@ -228,6 +229,7 @@ extmodule/CBregexp.o \ extmodule/CBregexpDll2.o \ extmodule/CDllHandler.o \ extmodule/CHtmlHelp.o \ +extmodule/CIcu4cI18n.o \ extmodule/CMigemo.o \ extmodule/CUxTheme.o \ func/CFuncKeyWnd.o \ diff --git a/sakura_core/charset/CCodeMediator.cpp b/sakura_core/charset/CCodeMediator.cpp index ebfa41b87f..c978dd2b51 100644 --- a/sakura_core/charset/CCodeMediator.cpp +++ b/sakura_core/charset/CCodeMediator.cpp @@ -1,6 +1,7 @@ /*! @file */ #include "StdAfx.h" #include "charset/CCodeMediator.h" +#include "charset/icu4c/CharsetDetector.h" #include "charset/CESI.h" #include "io/CBinaryStream.h" @@ -23,6 +24,13 @@ ECodeType CCodeMediator::CheckKanjiCode(const char* buff, size_t size) noexcept return m_sEncodingConfig.m_eDefaultCodetype; } + // ICU4CのDLL群が利用できる場合、ICU4Cによる判定を試みる + CharsetDetector csd; + if (csd.IsAvailable()) { + auto code = csd.Detect(std::string_view(buff, size)); + if (code != CODE_ERROR) return code; + } + CESI cesi(m_sEncodingConfig); return cesi.CheckKanjiCode(buff, size); } diff --git a/sakura_core/charset/icu4c/CharsetDetector.cpp b/sakura_core/charset/icu4c/CharsetDetector.cpp new file mode 100644 index 0000000000..0979e4b91f --- /dev/null +++ b/sakura_core/charset/icu4c/CharsetDetector.cpp @@ -0,0 +1,77 @@ +/*! @file */ +/* + Copyright (C) 2018-2019 Sakura Editor Organization + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; + you must not claim that you wrote the original software. + If you use this software in a product, an acknowledgment + in the product documentation would be appreciated but is + not required. + + 2. Altered source versions must be plainly marked as such, + and must not be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any source + distribution. +*/ +#include "StdAfx.h" +#include "CharsetDetector.h" + +CharsetDetector::CharsetDetector() noexcept + : _icuin() + , _csd(nullptr) +{ + _icuin.InitDll(); +} + +CharsetDetector::~CharsetDetector() noexcept +{ + if (_icuin.IsAvailable()) { + _icuin.ucsdet_close(_csd); + } +} + +ECodeType CharsetDetector::Detect(const std::string_view& bytes) +{ + UErrorCode status = U_ZERO_ERROR; + + _csd = _icuin.ucsdet_open(&status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + + _icuin.ucsdet_setText(_csd, bytes.data(), bytes.length(), &status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + + const auto csm = _icuin.ucsdet_detect(_csd, &status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + + std::string_view name = _icuin.ucsdet_getName(csm, &status); + if (status != U_ZERO_ERROR) { + return CODE_ERROR; + } + + // 文字セット名⇒サクラエディタ内部コードの変換 + if (name == "UTF-8") return CODE_UTF8; + if (name == "SHIFT_JIS") return CODE_SJIS; + if (name == "UTF-16BE") return CODE_UNICODEBE; + if (name == "UTF-16LE") return CODE_UNICODE; + if (name == "EUC-JP") return CODE_EUC; + if (name == "ISO-2022-JP") return CODE_JIS; + if (name == "UTF-7") return CODE_UTF7; + if (name == "ISO-8859-1") return CODE_LATIN1; + + return CODE_ERROR; +} diff --git a/sakura_core/charset/icu4c/CharsetDetector.h b/sakura_core/charset/icu4c/CharsetDetector.h new file mode 100644 index 0000000000..e43915a4d0 --- /dev/null +++ b/sakura_core/charset/icu4c/CharsetDetector.h @@ -0,0 +1,48 @@ +/*! @file */ +/* + Copyright (C) 2018-2019 Sakura Editor Organization + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; + you must not claim that you wrote the original software. + If you use this software in a product, an acknowledgment + in the product documentation would be appreciated but is + not required. + + 2. Altered source versions must be plainly marked as such, + and must not be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any source + distribution. +*/ +#pragma once + +#include + +#include "extmodule/CIcu4cI18n.h" + +/*! + * @brief 文字コード検出クラス + */ +class CharsetDetector final +{ + CIcu4cI18n _icuin; + UCharsetDetector* _csd; + +public: + CharsetDetector() noexcept; + ~CharsetDetector() noexcept; + + bool IsAvailable() const noexcept { + return _icuin.IsAvailable(); + } + + ECodeType Detect(const std::string_view& bytes); +}; diff --git a/sakura_core/extmodule/CIcu4cI18n.cpp b/sakura_core/extmodule/CIcu4cI18n.cpp new file mode 100644 index 0000000000..8dd3add3ec --- /dev/null +++ b/sakura_core/extmodule/CIcu4cI18n.cpp @@ -0,0 +1,69 @@ +/*! @file */ +/* + Copyright (C) 2018-2019 Sakura Editor Organization + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; + you must not claim that you wrote the original software. + If you use this software in a product, an acknowledgment + in the product documentation would be appreciated but is + not required. + + 2. Altered source versions must be plainly marked as such, + and must not be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any source + distribution. +*/ +#include "StdAfx.h" +#include "CIcu4cI18n.h" + +CIcu4cI18n::CIcu4cI18n() noexcept + : _ucsdet_open(nullptr) + , _ucsdet_setText(nullptr) + , _ucsdet_detect(nullptr) + , _ucsdet_close(nullptr) +{ +} + +CIcu4cI18n::~CIcu4cI18n() noexcept +{ +} + +/*! + * @brief DLLの名前を返す + */ +LPCWSTR CIcu4cI18n::GetDllNameImp(int index) +{ + (void*)index; + return L"icuin66.dll"; //バージョンは固定 +} + +/*! + DLLの初期化 + + 関数のアドレスを取得してメンバに保管する. + + @retval true 成功 + @retval false アドレス取得に失敗 +*/ +bool CIcu4cI18n::InitDllImp() +{ + //DLL内関数名リスト + const ImportTable table[] = { + { &_ucsdet_open, "ucsdet_open_66" }, //バージョンは固定 + { &_ucsdet_setText, "ucsdet_setText_66" }, //バージョンは固定 + { &_ucsdet_detect, "ucsdet_detect_66" }, //バージョンは固定 + { &_ucsdet_getName, "ucsdet_getName_66" }, //バージョンは固定 + { &_ucsdet_close, "ucsdet_close_66" }, //バージョンは固定 + { NULL, 0 } + }; + return RegisterEntries(table); +} diff --git a/sakura_core/extmodule/CIcu4cI18n.h b/sakura_core/extmodule/CIcu4cI18n.h new file mode 100644 index 0000000000..b64daec993 --- /dev/null +++ b/sakura_core/extmodule/CIcu4cI18n.h @@ -0,0 +1,81 @@ +/*! @file */ +/* + Copyright (C) 2018-2019 Sakura Editor Organization + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; + you must not claim that you wrote the original software. + If you use this software in a product, an acknowledgment + in the product documentation would be appreciated but is + not required. + + 2. Altered source versions must be plainly marked as such, + and must not be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any source + distribution. +*/ +#pragma once + +#include "CDllHandler.h" + +//ICU4Cの型定義 +class UCharsetDetector; +class UCharsetMatch; + +typedef enum UErrorCode { + U_ZERO_ERROR = 0, /**< No error, no warning. */ +} UErrorCode; + +/*! + * ICU4C の i18n ライブラリ(icuin.dll) をラップするクラス + */ +class CIcu4cI18n final : public CDllImp +{ + // DLL関数型定義 + typedef UCharsetDetector* (_cdecl *ucsdet_open_t)(UErrorCode *status); + typedef void (_cdecl *ucsdet_setText_t)(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); + typedef const UCharsetMatch * (_cdecl *ucsdet_detect_t)(UCharsetDetector *ucsd, UErrorCode *status); + typedef const char* (_cdecl *ucsdet_getName_t)(const UCharsetMatch *ucsm, UErrorCode *status); + typedef void (_cdecl *ucsdet_close_t)(UCharsetDetector *ucsd); + + // メンバ定義 + ucsdet_open_t _ucsdet_open; + ucsdet_setText_t _ucsdet_setText; + ucsdet_detect_t _ucsdet_detect; + ucsdet_getName_t _ucsdet_getName; + ucsdet_close_t _ucsdet_close; + +public: + CIcu4cI18n() noexcept; + virtual ~CIcu4cI18n() noexcept; + +protected: + // CDllImpインタフェース + LPCWSTR GetDllNameImp(int nIndex) override; + bool InitDllImp() override; + +public: + inline UCharsetDetector* ucsdet_open(UErrorCode *status) const { + return _ucsdet_open(status); + } + inline void ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status) const { + return _ucsdet_setText(ucsd, textIn, len, status); + } + inline const UCharsetMatch* ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status) const { + return _ucsdet_detect(ucsd, status); + } + inline const char* ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status) const { + return _ucsdet_getName(ucsm, status); + } + inline void ucsdet_close(UCharsetDetector *ucsd) const { + return _ucsdet_close(ucsd); + } +}; From 82c825228d3f34641bbfba2d4f9e457539d5b889 Mon Sep 17 00:00:00 2001 From: berryplus Date: Sun, 1 Dec 2019 02:10:52 +0900 Subject: [PATCH 3/4] =?UTF-8?q?=E3=82=B9=E3=83=9A=E3=83=AB=E3=83=9F?= =?UTF-8?q?=E3=82=B9=E8=A8=82=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 誤) _cdecl 正) __cdecl 呼出規約を表すキーワード__cdeclはアンダースコア2つが正しいです。 --- sakura_core/extmodule/CIcu4cI18n.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sakura_core/extmodule/CIcu4cI18n.h b/sakura_core/extmodule/CIcu4cI18n.h index b64daec993..c05f0d6486 100644 --- a/sakura_core/extmodule/CIcu4cI18n.h +++ b/sakura_core/extmodule/CIcu4cI18n.h @@ -40,11 +40,11 @@ typedef enum UErrorCode { class CIcu4cI18n final : public CDllImp { // DLL関数型定義 - typedef UCharsetDetector* (_cdecl *ucsdet_open_t)(UErrorCode *status); - typedef void (_cdecl *ucsdet_setText_t)(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); - typedef const UCharsetMatch * (_cdecl *ucsdet_detect_t)(UCharsetDetector *ucsd, UErrorCode *status); - typedef const char* (_cdecl *ucsdet_getName_t)(const UCharsetMatch *ucsm, UErrorCode *status); - typedef void (_cdecl *ucsdet_close_t)(UCharsetDetector *ucsd); + typedef UCharsetDetector* (__cdecl *ucsdet_open_t)(UErrorCode *status); + typedef void (__cdecl *ucsdet_setText_t)(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); + typedef const UCharsetMatch * (__cdecl *ucsdet_detect_t)(UCharsetDetector *ucsd, UErrorCode *status); + typedef const char* (__cdecl *ucsdet_getName_t)(const UCharsetMatch *ucsm, UErrorCode *status); + typedef void (__cdecl *ucsdet_close_t)(UCharsetDetector *ucsd); // メンバ定義 ucsdet_open_t _ucsdet_open; From da6037b42111b1f6e619c7cf2afbbcc9d0e229d0 Mon Sep 17 00:00:00 2001 From: berryplus Date: Sat, 7 Dec 2019 14:59:59 +0900 Subject: [PATCH 4/4] =?UTF-8?q?BOM=E3=82=B3=E3=83=BC=E3=83=89=E3=81=AB?= =?UTF-8?q?=E9=96=A2=E3=81=99=E3=82=8B=E8=AA=AC=E6=98=8E=E3=82=92=E6=8B=A1?= =?UTF-8?q?=E5=85=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BOMコードが間違っていたのをこっそり修正。 --- sakura_core/charset/CESI.h | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/sakura_core/charset/CESI.h b/sakura_core/charset/CESI.h index 212d7d111a..3cbb808d49 100644 --- a/sakura_core/charset/CESI.h +++ b/sakura_core/charset/CESI.h @@ -234,24 +234,34 @@ class CESI { inline ECodeType CESI::DetectUnicodeBom(const char* buff, size_t size) noexcept { - if (!buff) return CODE_NONE; + // バイト列がない、または、BOM表現を格納できるサイズに満たない場合、判定をスキップ + if (!buff || size < 2) return CODE_NONE; - constexpr const unsigned char szUtf8BOM[]{ 0xef, 0xbb, 0xbf }; - constexpr const unsigned char szUtf16BeBOM[]{ 0xff, 0xfe }; - constexpr const unsigned char szUtf16LeBOM[]{ 0xfe, 0xff }; - - if (size >= _countof(szUtf8BOM) - 1 - && 0 == ::memcmp(buff, szUtf8BOM, _countof(szUtf8BOM) - 1)) { + // バイト列の先頭が \ufeff の utf8 表現と一致するか判定 + constexpr const BYTE utf8BOM[]{ 0xef, 0xbb, 0xbf }; + if (size >= _countof(utf8BOM) && 0 == ::memcmp(buff, utf8BOM, _countof(utf8BOM))) { return CODE_UTF8; } - if (size >= _countof(szUtf16BeBOM) - 1 - && 0 == ::memcmp(buff, szUtf16BeBOM, _countof(szUtf16BeBOM) - 1)) { + + // バイト列の先頭が \ufeff の utf16BE 表現と一致するか判定 + constexpr const BYTE utf16BeBOM[]{ 0xfe, 0xff }; + if (size >= _countof(utf16BeBOM) && 0 == ::memcmp(buff, utf16BeBOM, _countof(utf16BeBOM))) { return CODE_UNICODEBE; } - if (size >= _countof(szUtf16LeBOM) - 1 - && 0 == ::memcmp(buff, szUtf16LeBOM, _countof(szUtf16LeBOM) - 1)) { + + // バイト列の先頭が \ufeff の utf16LE 表現と一致するか判定 + constexpr const BYTE utf16LeBOM[]{ 0xff, 0xfe }; + if (size >= _countof(utf16LeBOM) && 0 == ::memcmp(buff, utf16LeBOM, _countof(utf16LeBOM))) { return CODE_UNICODE; } + + // UTF-7 は ASCII 7bit 文字 でない文字を UTF-16BE で符号化してから 修正BASE64 で 符号化する。 + // Base64 の符号化は 6bit単位 なので BOM に続く文字が非7bit文字な場合、4バイト目がブレる。 + // このため、 UTF-7 については BOM による判別ロジック省略の対象から外している。 + // + // (BOM)abc ⇒ (UTF-7変換) ⇒ +/v8-abc + // (BOM)アイウ ⇒ (UTF-7変換) ⇒ +/v//cf9y/3M- + return CODE_NONE; }