From 00410f22fee1242fa1756824ccfa51d4778873b3 Mon Sep 17 00:00:00 2001
From: berryplus <berryzplus@gmail.com>
Date: Fri, 29 Nov 2019 22:55:19 +0900
Subject: [PATCH 1/4] =?UTF-8?q?CCodeMediator=E3=83=AA=E3=83=95=E3=82=A1?=
 =?UTF-8?q?=E3=82=AF=E3=82=BF=E3=83=AA=E3=83=B3=E3=82=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CCodeMediator.hからCESI.h依存を追い出して仲介者としての役割を果たせるようにする。
---
 sakura_core/_os/CClipboard.cpp        |   6 +-
 sakura_core/charset/CCodeMediator.cpp | 194 +++-----------------------
 sakura_core/charset/CCodeMediator.h   |  33 ++---
 sakura_core/charset/CESI.cpp          | 101 +++++++++++++-
 sakura_core/charset/CESI.h            |  43 +++++-
 sakura_core/io/CFileLoad.cpp          |  10 +-
 6 files changed, 180 insertions(+), 207 deletions(-)

diff --git a/sakura_core/_os/CClipboard.cpp b/sakura_core/_os/CClipboard.cpp
index a6a6e7fade..3673c49488 100644
--- a/sakura_core/_os/CClipboard.cpp
+++ b/sakura_core/_os/CClipboard.cpp
@@ -605,11 +605,7 @@ bool CClipboard::GetClipboradByFormat(CNativeW& mem, const wchar_t* pFormatName,
 		}else{
 			ECodeType eMode = (ECodeType)nMode;
 			if( !IsValidCodeType(eMode) ){
-				// コード不明と99は自動判別
-				ECodeType nBomCode = CCodeMediator::DetectUnicodeBom((const char*)pData, nLength);
-				if( nBomCode != CODE_NONE ){
-					eMode = nBomCode;
-				}else{
+				{
 					const STypeConfig& type = CEditDoc::GetInstance(0)->m_cDocType.GetDocumentAttribute();
 					CCodeMediator mediator(type.m_encoding);
 					eMode = mediator.CheckKanjiCode((const char*)pData, nLength);
diff --git a/sakura_core/charset/CCodeMediator.cpp b/sakura_core/charset/CCodeMediator.cpp
index 2d1515f177..ebfa41b87f 100644
--- a/sakura_core/charset/CCodeMediator.cpp
+++ b/sakura_core/charset/CCodeMediator.cpp
@@ -1,147 +1,8 @@
 ﻿/*! @file */
 #include "StdAfx.h"
 #include "charset/CCodeMediator.h"
-#include "charset/charcode.h"
 #include "charset/CESI.h"
 #include "io/CBinaryStream.h"
-#include "types/CType.h"
-
-/*!
-	文字列の先頭にUnicode系BOMが付いているか？
-
-	@retval CODE_UNICODE   UTF-16 LE
-	@retval CODE_UTF8      UTF-8
-	@retval CODE_UNICODEBE UTF-16 BE
-	@retval CODE_NONE      未検出
-
-	@date 2007.08.11 charcode.cpp から移動
-*/
-ECodeType CCodeMediator::DetectUnicodeBom( const char* pS, const int nLen )
-{
-	uchar_t *pBuf;
-
-	if( NULL == pS ){ return CODE_NONE; }
-
-	pBuf = (uchar_t *) pS;
-	if( 2 <= nLen ){
-		if( pBuf[0] == 0xff && pBuf[1] == 0xfe ){
-			return CODE_UNICODE;
-		}
-		if( pBuf[0] == 0xfe && pBuf[1] == 0xff ){
-			return CODE_UNICODEBE;
-		}
-		if( 3 <= nLen ){
-			if( pBuf[0] == 0xef && pBuf[1] == 0xbb && pBuf[2] == 0xbf ){
-				return CODE_UTF8;
-			}
-		}
-	}
-#if 0
-// 2015.03.05 Moca UTF-7 BOMは無効に変更
-// もしデータがASCII互換でUTF-7として正しければ、文字コード比較でUTF-7になるはず
-	if( 4 <= nLen ){
-		if( memcmp( pBuf, "+/v", 3 ) == 0
-			&& ( pBuf[3] == '8' || pBuf[3] == '9' || pBuf[3] == '+' || pBuf[3] == '/' ) ){
-			return CODE_UTF7;
-		}
-	}
-#endif
-	return CODE_NONE;
-}
-
-/*!
-	SJIS, JIS, EUCJP, UTF-8, UTF-7 を判定 (改)
-
-	@return SJIS, JIS, EUCJP, UTF-8, UTF-7 の何れかの ID を返す．
-
-	@note 適切な検出が行われた場合は、m_dwStatus に CESI_MB_DETECTED フラグが格納される。
-*/
-ECodeType CCodeMediator::DetectMBCode( CESI* pcesi )
-{
-//	pcesi->m_dwStatus = ESI_NOINFORMATION;
-
-	if( pcesi->GetDataLen() < (pcesi->m_apMbcInfo[0]->nSpecific - pcesi->m_apMbcInfo[0]->nPoints) * 2000 ){
-		// 不正バイトの割合が、全体の 0.05% 未満であることを確認。
-		// 全体の0.05%ほどの不正バイトは、無視する。
-		pcesi->SetStatus( ESI_NODETECTED );
-		return CODE_NONE;
-	}
-	if( pcesi->m_apMbcInfo[0]->nPoints <= 0 ){
-		pcesi->SetStatus( ESI_NODETECTED );
-		return CODE_NONE;
-	}
-
-	/*
-		判定状況を確認
-	*/
-	pcesi->SetStatus( ESI_MBC_DETECTED );
-	return pcesi->m_apMbcInfo[0]->eCodeID;
-}
-
-/*!
-	UTF-16 LE/BE を判定.
-
-	@retval CODE_UNICODE    UTF-16 LE が検出された
-	@retval CODE_UNICODEBE  UTF-16 BE が検出された
-	@retval 0               UTF-16 LE/BE ともに検出されなかった
-
-*/
-ECodeType CCodeMediator::DetectUnicode( CESI* pcesi )
-{
-//	pcesi->m_dwStatus = ESI_NOINFORMATION;
-
-	EBOMType ebom_type = pcesi->GetBOMType();
-	int ndatalen;
-	int nlinebreak;
-
-	if( ebom_type == ESI_BOMTYPE_UNKNOWN ){
-		pcesi->SetStatus( ESI_NODETECTED );
-		return CODE_NONE;
-	}
-
-	// 1行の平均桁数が200を超えている場合はUnicode未検出とする
-	ndatalen = pcesi->GetDataLen();
-	nlinebreak = pcesi->m_aWcInfo[ebom_type].nSpecific;  // 改行数を nlinebreakに取得
-	if( static_cast<double>(ndatalen) / nlinebreak > 200 ){
-		pcesi->SetStatus( ESI_NODETECTED );
-		return CODE_NONE;
-	}
-
-	pcesi->SetStatus( ESI_WC_DETECTED );
-	return pcesi->m_aWcInfo[ebom_type].eCodeID;
-}
-
-/*
-	日本語コードセット判定
-*/
-ECodeType CCodeMediator::CheckKanjiCode( CESI* pcesi )
-{
-	ECodeType nret;
-
-	/*
-		判定状況は、
-		DetectMBCode(), DetectUnicode() 内で
-		cesi.m_dwStatus に記録する。
-	*/
-
-	if( pcesi == NULL ){
-		return CODE_DEFAULT;
-	}
-	if( pcesi->GetMetaName() != CODE_NONE ){
-		return pcesi->GetMetaName();
-	}
-	nret = DetectUnicode( pcesi );
-	if( nret != CODE_NONE && pcesi->GetStatus() != ESI_NODETECTED ){
-		return nret;
-	}
-	nret = DetectMBCode( pcesi );
-	if( nret != CODE_NONE && pcesi->GetStatus() != ESI_NODETECTED ){
-		return nret;
-	}
-
-	// デフォルト文字コードを返す
-	return pcesi->m_pEncodingConfig->m_eDefaultCodetype;
-}
 
 /*
 	日本語コードセット判別
@@ -155,18 +16,15 @@ ECodeType CCodeMediator::CheckKanjiCode( CESI* pcesi )
 	UTF-7		CODE_UTF7
 	UnicodeBE	CODE_UNICODEBE
 */
-ECodeType CCodeMediator::CheckKanjiCode( const char* pBuf, int nBufLen )
+ECodeType CCodeMediator::CheckKanjiCode(const char* buff, size_t size) noexcept
 {
-	CESI cesi(*m_pEncodingConfig);
-
-	/*
-		判定状況は、
-		DetectMBCode(), DetectUnicode() 内で
-		cesi.m_dwStatus に記録する。
-	*/
+	// 0バイトならタイプ別のデフォルト設定
+	if (size == 0) {
+		return m_sEncodingConfig.m_eDefaultCodetype;
+	}
 
-	cesi.SetInformation( pBuf, nBufLen/*, CODE_SJIS*/ );
-	return CheckKanjiCode( &cesi );
+	CESI cesi(m_sEncodingConfig);
+	return cesi.CheckKanjiCode(buff, size);
 }
 
 /*
@@ -182,8 +40,12 @@ ECodeType CCodeMediator::CheckKanjiCode( const char* pBuf, int nBufLen )
 ||	UnicodeBE	CODE_UNICODEBE
 ||	エラー		CODE_ERROR
 */
-ECodeType CCodeMediator::CheckKanjiCodeOfFile( const WCHAR* pszFile )
+ECodeType CCodeMediator::CheckKanjiCodeOfFile(const WCHAR* pszFile)
 {
+	if (!pszFile) {
+		return CODE_ERROR;
+	}
+
 	// オープン
 	CBinaryInputStream in(pszFile);
 	if(!in){
@@ -191,33 +53,21 @@ ECodeType CCodeMediator::CheckKanjiCodeOfFile( const WCHAR* pszFile )
 	}
 
 	// データ長取得
-	int nBufLen = in.GetLength();
-	if( nBufLen > CheckKanjiCode_MAXREADLENGTH ){
-		nBufLen = CheckKanjiCode_MAXREADLENGTH;
-	}
-
-	// 0バイトならタイプ別のデフォルト設定
-	if( 0 == nBufLen ){
-		return m_pEncodingConfig->m_eDefaultCodetype;
-	}
+	auto size = std::min<size_t>(in.GetLength(), CheckKanjiCode_MAXREADLENGTH);
 
-	// データ確保
-	CMemory cMem;
-	cMem.AllocBuffer(nBufLen);
-	void* pBuf = cMem.GetRawPtr();
+	std::unique_ptr<char[]> buff;
+	if (size > 0)
+	{
+		// データ確保
+		buff = std::make_unique<char[]>(size);
 
-	// 読み込み
-	nBufLen = in.Read(pBuf, nBufLen);
+		// Read the data and keep only the byte count actually read, so a short read is not analysed at full size
+		size = std::min<size_t>(size, in.Read(buff.get(), size));
+	}
 
 	// クローズ
 	in.Close();
 
 	// 日本語コードセット判別
-	ECodeType nCodeType = DetectUnicodeBom( reinterpret_cast<const char*>(pBuf), nBufLen );
-	if( nCodeType == CODE_NONE ){
-		// Unicode BOM は検出されませんでした．
-		nCodeType = CheckKanjiCode( reinterpret_cast<const char*>(pBuf), nBufLen );
-	}
-
-	return nCodeType;
+	return CheckKanjiCode(buff.get(), size);
 }
diff --git a/sakura_core/charset/CCodeMediator.h b/sakura_core/charset/CCodeMediator.h
index 48348d9c0d..6d7af2702b 100644
--- a/sakura_core/charset/CCodeMediator.h
+++ b/sakura_core/charset/CCodeMediator.h
@@ -24,30 +24,27 @@
 */
 #pragma once
 
-#include "charset/CESI.h"
-class CEditDoc;
-
-class CCodeMediator{
-protected:
-	// CESI.cpp の判定関数をここに移す
-	static ECodeType DetectMBCode( CESI* pcesi );
-	static ECodeType DetectUnicode( CESI* pcesi );
-
+#include "types/CType.h" //SEncodingConfig
+
+/*!
+ * @brief CCodeMediator クラス
+ * 
+ * 日本語コードセット判別の詳細を隠ぺいするための仲介クラスです。
+ */
+class CCodeMediator final {
 public:
-
-	explicit CCodeMediator( const SEncodingConfig &ref ) : m_pEncodingConfig(&ref) { }
-
-	static ECodeType DetectUnicodeBom( const char* pS, const int nLen );
+	explicit CCodeMediator(const SEncodingConfig &encodingConfig) noexcept
+		: m_sEncodingConfig(encodingConfig)
+	{
+	}
 
 	/* 日本語コードセット判別 */
-	ECodeType CheckKanjiCode( const char* pBuf, int nBufLen );
+	ECodeType CheckKanjiCode(const char* buff, size_t size) noexcept;
 	/* ファイルの日本語コードセット判別 */
-	ECodeType CheckKanjiCodeOfFile( const WCHAR* pszFile );
-
-	static ECodeType CheckKanjiCode( CESI* pcesi );  // CESI 構造体（？）を外部で構築した場合に使用
+	ECodeType CheckKanjiCodeOfFile(const WCHAR* pszFile);
 
 private:
-	const SEncodingConfig* m_pEncodingConfig;
+	const SEncodingConfig& m_sEncodingConfig;
 };
 
 /*[EOF]*/
diff --git a/sakura_core/charset/CESI.cpp b/sakura_core/charset/CESI.cpp
index 1bd905c234..79d73c04a7 100644
--- a/sakura_core/charset/CESI.cpp
+++ b/sakura_core/charset/CESI.cpp
@@ -1134,6 +1134,103 @@ ECodeType CESI::AutoDetectByCoding( const char* pBuf, int nSize )
 	return CODE_NONE;
 }
 
+/*!
+	Decides among the multi-byte encodings SJIS, JIS, EUCJP, UTF-8 and UTF-7 (revised).
+
+	@return eCodeID of the first m_apMbcInfo entry, or CODE_NONE when nothing is detected.
+
+	@note Sets the status to ESI_MBC_DETECTED on success and to ESI_NODETECTED otherwise.
+*/
+static ECodeType DetectMBCode( CESI* pcesi )
+{
+//	pcesi->m_dwStatus = ESI_NOINFORMATION;
+
+	if( pcesi->GetDataLen() < (pcesi->m_apMbcInfo[0]->nSpecific - pcesi->m_apMbcInfo[0]->nPoints) * 2000 ){
+		// Reject when (nSpecific - nPoints) * 2000 exceeds the data length, i.e. above roughly 0.05% of the data.
+		// Presumably that difference counts invalid bytes (per the original note); smaller shares are ignored.
+		pcesi->SetStatus( ESI_NODETECTED );
+		return CODE_NONE;
+	}
+	if( pcesi->m_apMbcInfo[0]->nPoints <= 0 ){
+		pcesi->SetStatus( ESI_NODETECTED );
+		return CODE_NONE;
+	}
+
+	/*
+		Record the detection result
+	*/
+	pcesi->SetStatus( ESI_MBC_DETECTED );
+	return pcesi->m_apMbcInfo[0]->eCodeID;
+}
+
+/*!
+	Decides between UTF-16 LE and UTF-16 BE.
+
+	@retval CODE_UNICODE    UTF-16 LE was detected (per the original note)
+	@retval CODE_UNICODEBE  UTF-16 BE was detected (per the original note)
+	@retval CODE_NONE       neither UTF-16 LE nor BE was detected
+
+*/
+static ECodeType DetectUnicode( CESI* pcesi )
+{
+//	pcesi->m_dwStatus = ESI_NOINFORMATION;
+
+	EBOMType ebom_type = pcesi->GetBOMType();
+	int ndatalen;
+	int nlinebreak;
+
+	if( ebom_type == ESI_BOMTYPE_UNKNOWN ){
+		pcesi->SetStatus( ESI_NODETECTED );
+		return CODE_NONE;
+	}
+
+	// Treat the data as "not Unicode" when the average line length exceeds 200
+	ndatalen = pcesi->GetDataLen();
+	nlinebreak = pcesi->m_aWcInfo[ebom_type].nSpecific;  // number of line breaks
+	if( static_cast<double>(ndatalen) / nlinebreak > 200 ){	// NOTE(review): nlinebreak may be 0; this then relies on floating-point inf/NaN results — confirm
+		pcesi->SetStatus( ESI_NODETECTED );
+		return CODE_NONE;
+	}
+
+	pcesi->SetStatus( ESI_WC_DETECTED );
+	return pcesi->m_aWcInfo[ebom_type].eCodeID;
+}
+
+/*
+	Detects the Japanese code set of a buffer: BOM first, then meta name, UTF-16, multi-byte, and finally the configured default.
+*/
+ECodeType CESI::CheckKanjiCode(const char* pBuf, size_t nBufLen) noexcept
+{
+
+	// A Unicode BOM decides the result immediately; no statistics are gathered in that case
+	ECodeType nCodeType = DetectUnicodeBom(pBuf, nBufLen);
+	if (nCodeType != CODE_NONE) {
+		return nCodeType;
+	}
+
+	/*
+		The detection status is recorded through SetStatus()
+		inside DetectMBCode() and DetectUnicode(),
+		and is read back below with GetStatus().
+	*/
+	SetInformation(pBuf, nBufLen);	// NOTE(review): SetInformation() takes an int length, so nBufLen is narrowed here — confirm callers stay below INT_MAX
+
+	if( GetMetaName() != CODE_NONE ){
+		return GetMetaName();
+	}
+	auto nret = DetectUnicode( this );
+	if( nret != CODE_NONE && GetStatus() != ESI_NODETECTED ){
+		return nret;
+	}
+	nret = DetectMBCode( this );
+	if( nret != CODE_NONE && GetStatus() != ESI_NODETECTED ){
+		return nret;
+	}
+
+	// Nothing was detected: return the default code type from the encoding configuration
+	return m_pEncodingConfig->m_eDefaultCodetype;
+}
+
 #ifdef _DEBUG
 
 /*!
@@ -1152,8 +1249,8 @@ void CESI::GetDebugInfo( const char* pS, const int nLen, CNativeW* pcmtxtOut )
 	CESI cesi( doc.m_cDocType.GetDocumentAttribute().m_encoding );
 
 	// テスト実行
-	cesi.SetInformation( pS, nLen/*, CODE_SJIS*/ );
-	ecode_result = CCodeMediator::CheckKanjiCode( &cesi );
+	// CheckKanjiCode() runs SetInformation() itself unless a BOM already decides the result.
+	ecode_result = cesi.CheckKanjiCode(pS, nLen);
 
 	//
 	//	判別結果を分析
diff --git a/sakura_core/charset/CESI.h b/sakura_core/charset/CESI.h
index 664100704d..212d7d111a 100644
--- a/sakura_core/charset/CESI.h
+++ b/sakura_core/charset/CESI.h
@@ -91,10 +91,14 @@ class CESI {
 		m_eMetaName = CODE_NONE;
 	}
 
-	//! 調査結果の情報を格納
-	void SetInformation( const char *pS, const int nLen );
+	//! 日本語コードセット判定
+	ECodeType CheckKanjiCode(const char* buff, size_t size) noexcept;
 
 protected:
+	ECodeType DetectUnicodeBom(const char* pS, size_t nLen) noexcept;
+
+	//! 調査結果の情報を格納
+	void SetInformation( const char *pS, const int nLen );
 
 	//! 添え字に使われる優先順位表を作成
 	void InitPriorityTable( void );
@@ -216,4 +220,39 @@ class CESI {
 #endif
 };
 
+/*!
+	文字列の先頭にUnicode系BOMが付いているか？
+
+	@retval CODE_UNICODE   UTF-16 LE
+	@retval CODE_UTF8      UTF-8
+	@retval CODE_UNICODEBE UTF-16 BE
+	@retval CODE_NONE      未検出
+
+	@date 2007.08.11 charcode.cpp から移動
+	@date 2015.03.05 Moca UTF-7 BOMは無効に変更
+ */
+inline
+ECodeType CESI::DetectUnicodeBom(const char* buff, size_t size) noexcept
+{
+	if (!buff) return CODE_NONE;
+
+	constexpr const unsigned char szUtf8BOM[]{ 0xef, 0xbb, 0xbf };
+	constexpr const unsigned char szUtf16BeBOM[]{ 0xff, 0xfe };
+	constexpr const unsigned char szUtf16LeBOM[]{ 0xfe, 0xff };
+
+	if (size >= _countof(szUtf8BOM) - 1
+		&& 0 == ::memcmp(buff, szUtf8BOM, _countof(szUtf8BOM) - 1)) {
+		return CODE_UTF8;
+	}
+	if (size >= _countof(szUtf16BeBOM) - 1
+		&& 0 == ::memcmp(buff, szUtf16BeBOM, _countof(szUtf16BeBOM) - 1)) {
+		return CODE_UNICODEBE;
+	}
+	if (size >= _countof(szUtf16LeBOM) - 1
+		&& 0 == ::memcmp(buff, szUtf16LeBOM, _countof(szUtf16LeBOM) - 1)) {
+		return CODE_UNICODE;
+	}
+	return CODE_NONE;
+}
+
 /*[EOF]*/
diff --git a/sakura_core/io/CFileLoad.cpp b/sakura_core/io/CFileLoad.cpp
index 45b81065bc..f30383c170 100644
--- a/sakura_core/io/CFileLoad.cpp
+++ b/sakura_core/io/CFileLoad.cpp
@@ -157,7 +157,6 @@ ECodeType CFileLoad::FileOpen( LPCWSTR pFileName, bool bBigFile, ECodeType CharC
 {
 	HANDLE	hFile;
 	ULARGE_INTEGER	fileSize;
-	ECodeType	nBomCode;
 
 	// FileCloseを呼んでからにしてください
 	if( NULL != m_hFile ){
@@ -203,14 +202,9 @@ ECodeType CFileLoad::FileOpen( LPCWSTR pFileName, bool bBigFile, ECodeType CharC
 	// データ読み込み
 	Buffering();
 
-	nBomCode = CCodeMediator::DetectUnicodeBom( m_pReadBuf, m_nReadDataLen );
 	if( CharCode == CODE_AUTODETECT ){
-		if( nBomCode != CODE_NONE ){
-			CharCode = nBomCode;
-		}else{
-			CCodeMediator mediator(*m_pEencoding);
-			CharCode = mediator.CheckKanjiCode( m_pReadBuf, m_nReadDataLen );
-		}
+		CCodeMediator mediator(*m_pEencoding);
+		CharCode = mediator.CheckKanjiCode(m_pReadBuf, m_nReadDataLen);
 	}
 	// To Here Jun. 08, 2003
 	// 不正な文字コードのときはデフォルト(SJIS:無変換)を設定

From fc94768821082f9cbf27a37bf32a190b44a7e2c2 Mon Sep 17 00:00:00 2001
From: berryplus <berryzplus@gmail.com>
Date: Sat, 30 Nov 2019 17:47:44 +0900
Subject: [PATCH 2/4] =?UTF-8?q?ICU4C=E3=81=AB=E3=82=88=E3=82=8B=E6=96=87?=
 =?UTF-8?q?=E5=AD=97=E3=82=B3=E3=83=BC=E3=83=89=E6=A4=9C=E5=87=BA=E6=A9=9F?=
 =?UTF-8?q?=E8=83=BD=E3=82=92=E8=BF=BD=E5=8A=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 sakura/sakura.vcxproj                         |  4 +
 sakura/sakura.vcxproj.filters                 | 15 ++++
 sakura_core/Makefile                          |  2 +
 sakura_core/charset/CCodeMediator.cpp         |  8 ++
 sakura_core/charset/icu4c/CharsetDetector.cpp | 77 ++++++++++++++++++
 sakura_core/charset/icu4c/CharsetDetector.h   | 48 +++++++++++
 sakura_core/extmodule/CIcu4cI18n.cpp          | 69 ++++++++++++++++
 sakura_core/extmodule/CIcu4cI18n.h            | 81 +++++++++++++++++++
 8 files changed, 304 insertions(+)
 create mode 100644 sakura_core/charset/icu4c/CharsetDetector.cpp
 create mode 100644 sakura_core/charset/icu4c/CharsetDetector.h
 create mode 100644 sakura_core/extmodule/CIcu4cI18n.cpp
 create mode 100644 sakura_core/extmodule/CIcu4cI18n.h

diff --git a/sakura/sakura.vcxproj b/sakura/sakura.vcxproj
index cf905c0f9f..a89b337a14 100644
--- a/sakura/sakura.vcxproj
+++ b/sakura/sakura.vcxproj
@@ -303,6 +303,7 @@
     <ClInclude Include="..\sakura_core\charset\CUnicodeBe.h" />
     <ClInclude Include="..\sakura_core\charset\CUtf7.h" />
     <ClInclude Include="..\sakura_core\charset\CUtf8.h" />
+    <ClInclude Include="..\sakura_core\charset\icu4c\CharsetDetector.h" />
     <ClInclude Include="..\sakura_core\CHokanMgr.h" />
     <ClInclude Include="..\sakura_core\CKeyWordSetMgr.h" />
     <ClInclude Include="..\sakura_core\CLoadAgent.h" />
@@ -413,6 +414,7 @@
     <ClInclude Include="..\sakura_core\extmodule\CBregexpDll2.h" />
     <ClInclude Include="..\sakura_core\extmodule\CDllHandler.h" />
     <ClInclude Include="..\sakura_core\extmodule\CHtmlHelp.h" />
+    <ClInclude Include="..\sakura_core\extmodule\CIcu4cI18n.h" />
     <ClInclude Include="..\sakura_core\extmodule\CMigemo.h" />
     <ClInclude Include="..\sakura_core\extmodule\CUxTheme.h" />
     <ClInclude Include="..\sakura_core\Funccode_define.h" />
@@ -647,6 +649,7 @@
     <ClCompile Include="..\sakura_core\charset\CUnicodeBe.cpp" />
     <ClCompile Include="..\sakura_core\charset\CUtf7.cpp" />
     <ClCompile Include="..\sakura_core\charset\CUtf8.cpp" />
+    <ClCompile Include="..\sakura_core\charset\icu4c\CharsetDetector.cpp" />
     <ClCompile Include="..\sakura_core\CHokanMgr.cpp" />
     <ClCompile Include="..\sakura_core\CKeyWordSetMgr.cpp" />
     <ClCompile Include="..\sakura_core\CLoadAgent.cpp" />
@@ -776,6 +779,7 @@
     <ClCompile Include="..\sakura_core\extmodule\CBregexpDll2.cpp" />
     <ClCompile Include="..\sakura_core\extmodule\CDllHandler.cpp" />
     <ClCompile Include="..\sakura_core\extmodule\CHtmlHelp.cpp" />
+    <ClCompile Include="..\sakura_core\extmodule\CIcu4cI18n.cpp" />
     <ClCompile Include="..\sakura_core\extmodule\CMigemo.cpp" />
     <ClCompile Include="..\sakura_core\extmodule\CUxTheme.cpp" />
     <ClCompile Include="..\sakura_core\func\CFuncKeyWnd.cpp" />
diff --git a/sakura/sakura.vcxproj.filters b/sakura/sakura.vcxproj.filters
index 0b21690eb5..be2661ada1 100644
--- a/sakura/sakura.vcxproj.filters
+++ b/sakura/sakura.vcxproj.filters
@@ -119,6 +119,9 @@
     <Filter Include="Cpp Source Files\uiparts">
       <UniqueIdentifier>{930f3f82-ab3f-49e3-af4a-d4f9c2d51f46}</UniqueIdentifier>
     </Filter>
+    <Filter Include="Cpp Source Files\charset\icu4c">
+      <UniqueIdentifier>{e4629f85-3be8-4dda-80db-1be310929433}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\sakura_core\Funccode_define.h">
@@ -1085,6 +1088,12 @@
     <ClInclude Include="..\sakura_core\mem\CPoolResource.h">
       <Filter>Cpp Source Files\mem</Filter>
     </ClInclude>
+    <ClInclude Include="..\sakura_core\extmodule\CIcu4cI18n.h">
+      <Filter>Cpp Source Files\extmodule</Filter>
+    </ClInclude>
+    <ClInclude Include="..\sakura_core\charset\icu4c\CharsetDetector.h">
+      <Filter>Cpp Source Files\charset\icu4c</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <None Include="..\resource\auto_scroll_center.cur">
@@ -2252,6 +2261,12 @@
     <ClCompile Include="..\sakura_core\dlg\CDlgOpenFile_CommonItemDialog.cpp">
       <Filter>Cpp Source Files\dlg</Filter>
     </ClCompile>
+    <ClCompile Include="..\sakura_core\extmodule\CIcu4cI18n.cpp">
+      <Filter>Cpp Source Files\extmodule</Filter>
+    </ClCompile>
+    <ClCompile Include="..\sakura_core\charset\icu4c\CharsetDetector.cpp">
+      <Filter>Cpp Source Files\charset\icu4c</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <Image Include="..\resource\auto_scroll_center.bmp">
diff --git a/sakura_core/Makefile b/sakura_core/Makefile
index 5ea88e13f6..55d68a600b 100644
--- a/sakura_core/Makefile
+++ b/sakura_core/Makefile
@@ -115,6 +115,7 @@ charset/CUnicode.o \
 charset/CUnicodeBe.o \
 charset/CUtf7.o \
 charset/CUtf8.o \
+charset/icu4c/CharsetDetector.o \
 cmd/CViewCommander.o \
 cmd/CViewCommander_Bookmark.o \
 cmd/CViewCommander_Clipboard.o \
@@ -228,6 +229,7 @@ extmodule/CBregexp.o \
 extmodule/CBregexpDll2.o \
 extmodule/CDllHandler.o \
 extmodule/CHtmlHelp.o \
+extmodule/CIcu4cI18n.o \
 extmodule/CMigemo.o \
 extmodule/CUxTheme.o \
 func/CFuncKeyWnd.o \
diff --git a/sakura_core/charset/CCodeMediator.cpp b/sakura_core/charset/CCodeMediator.cpp
index ebfa41b87f..c978dd2b51 100644
--- a/sakura_core/charset/CCodeMediator.cpp
+++ b/sakura_core/charset/CCodeMediator.cpp
@@ -1,6 +1,7 @@
 ﻿/*! @file */
 #include "StdAfx.h"
 #include "charset/CCodeMediator.h"
+#include "charset/icu4c/CharsetDetector.h"
 #include "charset/CESI.h"
 #include "io/CBinaryStream.h"
 
@@ -23,6 +24,13 @@ ECodeType CCodeMediator::CheckKanjiCode(const char* buff, size_t size) noexcept
 		return m_sEncodingConfig.m_eDefaultCodetype;
 	}
 
+	// ICU4CのDLL群が利用できる場合、ICU4Cによる判定を試みる
+	CharsetDetector csd;
+	if (csd.IsAvailable()) {
+		auto code = csd.Detect(std::string_view(buff, size));
+		if (code != CODE_ERROR) return code;
+	}
+
 	CESI cesi(m_sEncodingConfig);
 	return cesi.CheckKanjiCode(buff, size);
 }
diff --git a/sakura_core/charset/icu4c/CharsetDetector.cpp b/sakura_core/charset/icu4c/CharsetDetector.cpp
new file mode 100644
index 0000000000..0979e4b91f
--- /dev/null
+++ b/sakura_core/charset/icu4c/CharsetDetector.cpp
@@ -0,0 +1,77 @@
+﻿/*! @file */
+/*
+	Copyright (C) 2018-2019 Sakura Editor Organization
+
+	This software is provided 'as-is', without any express or implied
+	warranty. In no event will the authors be held liable for any damages
+	arising from the use of this software.
+
+	Permission is granted to anyone to use this software for any purpose,
+	including commercial applications, and to alter it and redistribute it
+	freely, subject to the following restrictions:
+
+		1. The origin of this software must not be misrepresented;
+		   you must not claim that you wrote the original software.
+		   If you use this software in a product, an acknowledgment
+		   in the product documentation would be appreciated but is
+		   not required.
+
+		2. Altered source versions must be plainly marked as such,
+		   and must not be misrepresented as being the original software.
+
+		3. This notice may not be removed or altered from any source
+		   distribution.
+*/
+#include "StdAfx.h"
+#include "CharsetDetector.h"
+
+CharsetDetector::CharsetDetector() noexcept
+	: _icuin()
+	, _csd(nullptr)	// no detector handle exists until Detect() opens one
+{
+	_icuin.InitDll();	// return value is ignored here; availability is queried later through IsAvailable()
+}
+
+CharsetDetector::~CharsetDetector() noexcept
+{
+	if (_icuin.IsAvailable()) {	// presumably the imported function pointers are only set when this is true — confirm in CDllImp
+		_icuin.ucsdet_close(_csd);	// _csd is still nullptr here if Detect() was never called
+	}
+}
+
+//! Detects the charset of @a bytes with ICU and maps it to an ECodeType; call only when IsAvailable() is true.
+//! @return the mapped code type, or CODE_ERROR on an ICU error, no match, or a charset missing from the table below.
+ECodeType CharsetDetector::Detect(const std::string_view& bytes)
+{
+	UErrorCode status = U_ZERO_ERROR;
+	// Release the detector left over from a previous call so that repeated calls do not leak it.
+	if (_csd) _icuin.ucsdet_close(_csd);
+	_csd = _icuin.ucsdet_open(&status);
+	if (status != U_ZERO_ERROR || !_csd) return CODE_ERROR;
+
+	// ICU takes the length as int32_t: clamp it so that a huge buffer cannot wrap to a negative value.
+	const auto len = static_cast<int32_t>(bytes.length() < 0x7fffffff ? bytes.length() : 0x7fffffff);
+	_icuin.ucsdet_setText(_csd, bytes.data(), len, &status);
+	if (status != U_ZERO_ERROR) return CODE_ERROR;
+
+	// ucsdet_detect() returns a null match when no charset fits the data.
+	const auto csm = _icuin.ucsdet_detect(_csd, &status);
+	if (status != U_ZERO_ERROR || !csm) return CODE_ERROR;
+
+	// Check the raw pointer first: building a std::string_view from a null pointer is undefined behaviour.
+	const char* pszName = _icuin.ucsdet_getName(csm, &status);
+	if (status != U_ZERO_ERROR || !pszName) return CODE_ERROR;
+	const std::string_view name(pszName);
+
+	// Map the ICU charset name to the editor's internal code type.
+	if (name == "UTF-8") return CODE_UTF8;
+	if (name == "SHIFT_JIS") return CODE_SJIS;
+	if (name == "UTF-16BE") return CODE_UNICODEBE;
+	if (name == "UTF-16LE") return CODE_UNICODE;
+	if (name == "EUC-JP") return CODE_EUC;
+	if (name == "ISO-2022-JP") return CODE_JIS;
+	if (name == "UTF-7") return CODE_UTF7;
+	if (name == "ISO-8859-1") return CODE_LATIN1;
+
+	return CODE_ERROR;	// ICU named a charset that has no entry above
+}
diff --git a/sakura_core/charset/icu4c/CharsetDetector.h b/sakura_core/charset/icu4c/CharsetDetector.h
new file mode 100644
index 0000000000..e43915a4d0
--- /dev/null
+++ b/sakura_core/charset/icu4c/CharsetDetector.h
@@ -0,0 +1,48 @@
+﻿/*! @file */
+/*
+	Copyright (C) 2018-2019 Sakura Editor Organization
+
+	This software is provided 'as-is', without any express or implied
+	warranty. In no event will the authors be held liable for any damages
+	arising from the use of this software.
+
+	Permission is granted to anyone to use this software for any purpose,
+	including commercial applications, and to alter it and redistribute it
+	freely, subject to the following restrictions:
+
+		1. The origin of this software must not be misrepresented;
+		   you must not claim that you wrote the original software.
+		   If you use this software in a product, an acknowledgment
+		   in the product documentation would be appreciated but is
+		   not required.
+
+		2. Altered source versions must be plainly marked as such,
+		   and must not be misrepresented as being the original software.
+
+		3. This notice may not be removed or altered from any source
+		   distribution.
+*/
+#pragma once
+
+#include <string_view>
+
+#include "extmodule/CIcu4cI18n.h"
+
+/*!
+ * @brief Charset detection class backed by CIcu4cI18n; owns one UCharsetDetector handle, so it is non-copyable.
+ */
+class CharsetDetector final
+{
+	CIcu4cI18n _icuin;			//!< wrapper around the imported ICU entry points
+	UCharsetDetector* _csd;		//!< handle opened by Detect() and closed by the destructor
+
+public:
+	CharsetDetector() noexcept;
+	~CharsetDetector() noexcept;
+	CharsetDetector(const CharsetDetector&) = delete;				// a copy would close _csd twice
+	CharsetDetector& operator=(const CharsetDetector&) = delete;
+
+	//! Forwards CIcu4cI18n::IsAvailable(); Detect() should only be called when this returns true.
+	bool IsAvailable() const noexcept { return _icuin.IsAvailable(); }
+	ECodeType Detect(const std::string_view& bytes);
+};
diff --git a/sakura_core/extmodule/CIcu4cI18n.cpp b/sakura_core/extmodule/CIcu4cI18n.cpp
new file mode 100644
index 0000000000..8dd3add3ec
--- /dev/null
+++ b/sakura_core/extmodule/CIcu4cI18n.cpp
@@ -0,0 +1,69 @@
+﻿/*! @file */
+/*
+	Copyright (C) 2018-2019 Sakura Editor Organization
+
+	This software is provided 'as-is', without any express or implied
+	warranty. In no event will the authors be held liable for any damages
+	arising from the use of this software.
+
+	Permission is granted to anyone to use this software for any purpose,
+	including commercial applications, and to alter it and redistribute it
+	freely, subject to the following restrictions:
+
+		1. The origin of this software must not be misrepresented;
+		   you must not claim that you wrote the original software.
+		   If you use this software in a product, an acknowledgment
+		   in the product documentation would be appreciated but is
+		   not required.
+
+		2. Altered source versions must be plainly marked as such,
+		   and must not be misrepresented as being the original software.
+
+		3. This notice may not be removed or altered from any source
+		   distribution.
+*/
+#include "StdAfx.h"
+#include "CIcu4cI18n.h"
+
+CIcu4cI18n::CIcu4cI18n() noexcept
+	: _ucsdet_open(nullptr)
+	, _ucsdet_setText(nullptr)
+	, _ucsdet_detect(nullptr)
+	, _ucsdet_getName(nullptr)
+	, _ucsdet_close(nullptr)
+{}	// every imported entry point starts out null until InitDllImp() registers it
+
+CIcu4cI18n::~CIcu4cI18n() noexcept
+{	// intentionally empty: this class adds no cleanup of its own
+}
+
+/*!
+ * @brief Returns the name of the DLL to load (the same name for every index).
+ */
+LPCWSTR CIcu4cI18n::GetDllNameImp(int index)
+{
+	(void)index;	// unused: only a single DLL name is offered
+	return L"icuin66.dll"; // the ICU version is fixed
+}
+
+/*!
+	Initialises the DLL bindings.
+
+	Hands the table of (member pointer, exported name) pairs to RegisterEntries(), presumably to resolve each address.
+
+	@retval true  success (as reported by RegisterEntries)
+	@retval false an address could not be obtained (per the original note)
+*/
+bool CIcu4cI18n::InitDllImp()
+{
+	// List of the function names inside the DLL
+	const ImportTable table[] = {
+		{ &_ucsdet_open,		"ucsdet_open_66" },		// the version suffix is fixed
+		{ &_ucsdet_setText,		"ucsdet_setText_66" },	// the version suffix is fixed
+		{ &_ucsdet_detect,		"ucsdet_detect_66" },	// the version suffix is fixed
+		{ &_ucsdet_getName,		"ucsdet_getName_66" },	// the version suffix is fixed
+		{ &_ucsdet_close,		"ucsdet_close_66" },	// the version suffix is fixed
+		{ NULL, 0 }
+	};
+	return RegisterEntries(table);
+}
diff --git a/sakura_core/extmodule/CIcu4cI18n.h b/sakura_core/extmodule/CIcu4cI18n.h
new file mode 100644
index 0000000000..b64daec993
--- /dev/null
+++ b/sakura_core/extmodule/CIcu4cI18n.h
@@ -0,0 +1,81 @@
+﻿/*! @file */
+/*
+	Copyright (C) 2018-2019 Sakura Editor Organization
+
+	This software is provided 'as-is', without any express or implied
+	warranty. In no event will the authors be held liable for any damages
+	arising from the use of this software.
+
+	Permission is granted to anyone to use this software for any purpose,
+	including commercial applications, and to alter it and redistribute it
+	freely, subject to the following restrictions:
+
+		1. The origin of this software must not be misrepresented;
+		   you must not claim that you wrote the original software.
+		   If you use this software in a product, an acknowledgment
+		   in the product documentation would be appreciated but is
+		   not required.
+
+		2. Altered source versions must be plainly marked as such,
+		   and must not be misrepresented as being the original software.
+
+		3. This notice may not be removed or altered from any source
+		   distribution.
+*/
+#pragma once
+
+#include "CDllHandler.h"
+
+//ICU4Cの型定義
+class UCharsetDetector;
+class UCharsetMatch;
+
+typedef enum UErrorCode {
+	U_ZERO_ERROR = 0,     /**< No error, no warning. */
+} UErrorCode;
+
+/*!
+ * ICU4C の i18n ライブラリ(icuin.dll) をラップするクラス
+ */
+class CIcu4cI18n final : public CDllImp
+{
+	// DLL関数型定義
+	typedef UCharsetDetector*		(_cdecl *ucsdet_open_t)(UErrorCode *status);
+	typedef void					(_cdecl *ucsdet_setText_t)(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
+	typedef const UCharsetMatch *	(_cdecl *ucsdet_detect_t)(UCharsetDetector *ucsd, UErrorCode *status);
+	typedef const char*				(_cdecl *ucsdet_getName_t)(const UCharsetMatch *ucsm, UErrorCode *status);
+	typedef void					(_cdecl *ucsdet_close_t)(UCharsetDetector *ucsd);
+
+	// メンバ定義
+	ucsdet_open_t		_ucsdet_open;
+	ucsdet_setText_t	_ucsdet_setText;
+	ucsdet_detect_t		_ucsdet_detect;
+	ucsdet_getName_t	_ucsdet_getName;
+	ucsdet_close_t		_ucsdet_close;
+
+public:
+	CIcu4cI18n() noexcept;
+	virtual ~CIcu4cI18n() noexcept;
+
+protected:
+	// CDllImpインタフェース
+	LPCWSTR GetDllNameImp(int nIndex) override;
+	bool InitDllImp() override;
+
+public:
+	inline UCharsetDetector* ucsdet_open(UErrorCode *status) const {
+		return _ucsdet_open(status);
+	}
+	inline void ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status) const {
+		return _ucsdet_setText(ucsd, textIn, len, status);
+	}
+	inline const UCharsetMatch* ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status) const {
+		return _ucsdet_detect(ucsd, status);
+	}
+	inline const char* ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status) const {
+		return _ucsdet_getName(ucsm, status);
+	}
+	inline void ucsdet_close(UCharsetDetector *ucsd) const {
+		return _ucsdet_close(ucsd);
+	}
+};

From 82c825228d3f34641bbfba2d4f9e457539d5b889 Mon Sep 17 00:00:00 2001
From: berryplus <berryzplus@gmail.com>
Date: Sun, 1 Dec 2019 02:10:52 +0900
Subject: [PATCH 3/4] =?UTF-8?q?=E3=82=B9=E3=83=9A=E3=83=AB=E3=83=9F?=
 =?UTF-8?q?=E3=82=B9=E8=A8=82=E6=AD=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

誤) _cdecl
正) __cdecl
呼出規約を表すキーワード__cdeclはアンダースコア2つが正しいです。
---
 sakura_core/extmodule/CIcu4cI18n.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sakura_core/extmodule/CIcu4cI18n.h b/sakura_core/extmodule/CIcu4cI18n.h
index b64daec993..c05f0d6486 100644
--- a/sakura_core/extmodule/CIcu4cI18n.h
+++ b/sakura_core/extmodule/CIcu4cI18n.h
@@ -40,11 +40,11 @@ typedef enum UErrorCode {
 class CIcu4cI18n final : public CDllImp
 {
 	// DLL関数型定義
-	typedef UCharsetDetector*		(_cdecl *ucsdet_open_t)(UErrorCode *status);
-	typedef void					(_cdecl *ucsdet_setText_t)(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
-	typedef const UCharsetMatch *	(_cdecl *ucsdet_detect_t)(UCharsetDetector *ucsd, UErrorCode *status);
-	typedef const char*				(_cdecl *ucsdet_getName_t)(const UCharsetMatch *ucsm, UErrorCode *status);
-	typedef void					(_cdecl *ucsdet_close_t)(UCharsetDetector *ucsd);
+	typedef UCharsetDetector*		(__cdecl *ucsdet_open_t)(UErrorCode *status);
+	typedef void					(__cdecl *ucsdet_setText_t)(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
+	typedef const UCharsetMatch *	(__cdecl *ucsdet_detect_t)(UCharsetDetector *ucsd, UErrorCode *status);
+	typedef const char*				(__cdecl *ucsdet_getName_t)(const UCharsetMatch *ucsm, UErrorCode *status);
+	typedef void					(__cdecl *ucsdet_close_t)(UCharsetDetector *ucsd);
 
 	// メンバ定義
 	ucsdet_open_t		_ucsdet_open;

From da6037b42111b1f6e619c7cf2afbbcc9d0e229d0 Mon Sep 17 00:00:00 2001
From: berryplus <berryzplus@gmail.com>
Date: Sat, 7 Dec 2019 14:59:59 +0900
Subject: [PATCH 4/4] =?UTF-8?q?BOM=E3=82=B3=E3=83=BC=E3=83=89=E3=81=AB?=
 =?UTF-8?q?=E9=96=A2=E3=81=99=E3=82=8B=E8=AA=AC=E6=98=8E=E3=82=92=E6=8B=A1?=
 =?UTF-8?q?=E5=85=85?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

BOMコードが間違っていたのをこっそり修正。
---
 sakura_core/charset/CESI.h | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/sakura_core/charset/CESI.h b/sakura_core/charset/CESI.h
index 212d7d111a..3cbb808d49 100644
--- a/sakura_core/charset/CESI.h
+++ b/sakura_core/charset/CESI.h
@@ -234,24 +234,34 @@ class CESI {
 inline
 ECodeType CESI::DetectUnicodeBom(const char* buff, size_t size) noexcept
 {
-	if (!buff) return CODE_NONE;
+	// バイト列がない、または、BOM表現を格納できるサイズに満たない場合、判定をスキップ
+	if (!buff || size < 2) return CODE_NONE;
 
-	constexpr const unsigned char szUtf8BOM[]{ 0xef, 0xbb, 0xbf };
-	constexpr const unsigned char szUtf16BeBOM[]{ 0xff, 0xfe };
-	constexpr const unsigned char szUtf16LeBOM[]{ 0xfe, 0xff };
-
-	if (size >= _countof(szUtf8BOM) - 1
-		&& 0 == ::memcmp(buff, szUtf8BOM, _countof(szUtf8BOM) - 1)) {
+	// バイト列の先頭が \ufeff の utf8 表現と一致するか判定
+	constexpr const BYTE utf8BOM[]{ 0xef, 0xbb, 0xbf };
+	if (size >= _countof(utf8BOM) && 0 == ::memcmp(buff, utf8BOM, _countof(utf8BOM))) {
 		return CODE_UTF8;
 	}
-	if (size >= _countof(szUtf16BeBOM) - 1
-		&& 0 == ::memcmp(buff, szUtf16BeBOM, _countof(szUtf16BeBOM) - 1)) {
+
+	// バイト列の先頭が \ufeff の utf16BE 表現と一致するか判定
+	constexpr const BYTE utf16BeBOM[]{ 0xfe, 0xff };
+	if (size >= _countof(utf16BeBOM) && 0 == ::memcmp(buff, utf16BeBOM, _countof(utf16BeBOM))) {
 		return CODE_UNICODEBE;
 	}
-	if (size >= _countof(szUtf16LeBOM) - 1
-		&& 0 == ::memcmp(buff, szUtf16LeBOM, _countof(szUtf16LeBOM) - 1)) {
+
+	// バイト列の先頭が \ufeff の utf16LE 表現と一致するか判定
+	constexpr const BYTE utf16LeBOM[]{ 0xff, 0xfe };
+	if (size >= _countof(utf16LeBOM) && 0 == ::memcmp(buff, utf16LeBOM, _countof(utf16LeBOM))) {
 		return CODE_UNICODE;
 	}
+
+	// UTF-7 は ASCII 7bit 文字 でない文字を UTF-16BE で符号化してから 修正BASE64 で 符号化する。
+	// Base64 の符号化は 6bit単位 なので BOM に続く文字が非7bit文字な場合、4バイト目がブレる。
+	// このため、 UTF-7 については BOM による判別ロジック省略の対象から外している。
+	// 
+	// (BOM)abc ⇒ (UTF-7変換) ⇒ +/v8-abc
+	// (BOM)ｱｲｳ ⇒ (UTF-7変換) ⇒ +/v//cf9y/3M-
+
 	return CODE_NONE;
 }