Skip to content

Commit

Permalink
2024/04/24あたりからのHTMLの変更(どんぐり関連)への対応 #11
Browse files (browse the repository at this point in the history)
  • Loading branch information
onihusube committed Apr 25, 2024
1 parent c38805a commit 54d7444
Show file tree
Hide file tree
Showing 3 changed files with 1,462 additions and 7 deletions.
40 changes: 33 additions & 7 deletions HtmlConverter/HtmltoDat.cs
Original file line number | Diff line number | Diff line change
Expand Up @@ -189,7 +189,7 @@ public Byte[] Gethtml(String URI, int range, String UA, bool CRReplace, String L
System.Diagnostics.Debug.WriteLine("CGI ver202306形式");

// 1300行ほど飛ばす
// /1-で取ると1300行くらいしか前がない場合がある(なんG/なんJとか)
// /1-で取ると1390行くらいしか前がない場合がある(なんG/なんJとか)
for (int i = 0; i <= 1300; ++i)
{
html.ReadLine();
Expand Down Expand Up @@ -295,13 +295,27 @@ private StringBuilder CGI202306_ConvertProcess(string title, string URI, string
// レスの連続抽出はざっくりとやる
var ResMatches = Regex.Matches(allres, @"<article id=.+?</section></article>");
// ↑で抽出した1つのレス内で各要素を抽出
Regex ResContent = new Regex(@"<article id=.(?<num>\d+?).+?<summary>.+?<span class=.postusername.>(?<name><b>.+?</b>)</span></summary><span class=.date.>(?<date>.+?)</span><span class=.uid.>(?<id>.*?)</span>(?<be><span class=.be.+?</span>)?</details><section class=.post-content.>(?<massage>.+?)</section></article>");
Regex ResContent
= new Regex(@"<article id=.(?<num>\d+?).+?<summary>.+?<span class=.postusername.>(?<name><b>.+?</b>)</span></summary><span class=.date.>(?<date>.+?)</span><span class=.uid.>(?<id>.*?)</span>(?<be><span class=.be.+?</span>)?</details><section class=.post-content.>(?<massage>.+?)</section></article>");
Regex ResContentWithAcorn
= new Regex(@"<article id=.(?<num>\d+?).+?<summary>.+?<span class=.postusername.>(?<name><b>.+?</b>)</span><span style=.+?</summary><span style=.+?><span class=.date.>(?<date>.+?)</span><span class=.uid.>(?<id>.*?)</span></span>(?<be><span class=.be.+?</span>)?</details><section class=.post-content.>(?<massage>.+?)</section></article>");

// 旧型式(API移行直後のhtml形式)の処理を再利用するために、レス部分のhtmlを1レスづつ旧型式に変換する
// 細部のハンドリングを継承するための措置
foreach (Match resmatch in ResMatches)
{
Match res_content = ResContent.Match(resmatch.Value);
Match res_content;

// どんぐりの有効性によって抽出方法を変更
if (ResContent.IsMatch(resmatch.Value))
{
res_content = ResContent.Match(resmatch.Value);
}
else
{
res_content = ResContentWithAcorn.Match(resmatch.Value);
}


string resnumber = res_content.Groups["num"].Value;
string name = res_content.Groups["name"].Value;
Expand Down Expand Up @@ -800,13 +814,13 @@ private static String html2dat(String res)
{
ResBody.Replace(m.Groups[0].Value, "sssp:" + m.Groups[1].Value);
}

//Beアイコン、絵文字リンクの成型http:→sssp:
//先頭行のリンク処理
if (Regex.IsMatch(temp, @"^\s(<img src=.(?:https?:)?(\/\/img\.(?:2|5)ch\.net.+?).>)(?:(\s)<br>)?"))
const string top_imglink_pattern = @"^ (<img src=.(?:https?:)?(\/\/img\.(?:2|5)ch\.net.+?).>)(?:( )<br>)?";
if (Regex.IsMatch(temp, top_imglink_pattern))
{
var mae = Regex.Match(temp, @"^\s(<img src=.(?:https?:)?(\/\/img\.(?:2|5)ch\.net.+?).>)(?:(\s)<br>)?").Groups;
//if (Regex.IsMatch(BuildDat.ToString(), @"BE:\d+"))
var mae = Regex.Match(temp, top_imglink_pattern).Groups;
if (be == true)
{
ResBody.Replace(mae[1].Value, "sssp:" + mae[2].Value);
Expand All @@ -817,6 +831,18 @@ private static String html2dat(String res)
ResBody.Replace(mae[1].Value, " sssp:" + mae[2].Value + mae[3].Value);
}
}
// 行頭のお絵描きリンク処理
const string top_oekakilink_pattern = @"^<br> <img src=.(//o.5ch.net/.+?)"">";
if (Regex.IsMatch(temp, top_oekakilink_pattern))
{
/*
* こんな感じの対応
* <section class="post-content"> <br> <img src="//o.5ch.net/20vzd.png"> </section>
* <> <br> sssp://o.5ch.net/20vzd.png <>
*/
var match = Regex.Match(temp, top_oekakilink_pattern).Groups;
ResBody.Replace(match[0].Value, $" <br> sssp:{match[1].Value} ");
}

//その他本文中リンク処理
t = Regex.Matches(temp, @"(?:<br>(\s))?(<img src=.(?:https?:)?(\/\/img\.(?:2|5)ch\.net.+?).>)(?:(\s)<br>)?");
Expand Down
13 changes: 13 additions & 0 deletions doc/donguri/html/testdata.txt
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,13 @@
<article id="2" data-date="NG" data-userid="ID:snSCSwWG0" data-id="2" class="clear post"><details open class="post-header"><summary><span class="postid">0002</span><span class="postusername"><b>名無しさん@お腹いっぱい。</b></span><span style="float:right; max-height: 1em;"><form action="https://donguri.5ch.net/confirm" target="_blank" accept-charset="UTF-8" method="GET"><a href="https://donguri.5ch.net" target="_blank">垢版</a> | <input type="hidden" name="url" value="//egg.5ch.net/test/read.cgi/software/1712375061/"><input type="hidden" name="date" value="2024/04/06(土) 13:03:07.40"><button type="submit" style="border: none; background: none; cursor: pointer; font-size: 1em; color:#9b4dca;">大砲</button></form></span></summary><span style="width:100%;"><span class="date">2024/04/06(土) 13:03:07.40</span><span class="uid">ID:snSCSwWG0</span></span></details><section class="post-content"> <a href="../test/read.cgi/software/1712375061/1" rel="noopener noreferrer" target="_blank" class="reply_link">&gt;&gt;1</a>乙 </section></article>

<article id="5" data-date="NG" data-userid="ID:GhADKlpu0" data-id="5" class="clear post"><details open class="post-header"><summary><span class="postid">0005</span><span class="postusername"><b><a href="mailto:sage">名無しさん@お腹いっぱい。</b> <span style="color:green;">警備員[Lv.1][新][苗]</span><b></a></b></span><span style="float:right; max-height: 1em;"><form action="https://donguri.5ch.net/confirm" target="_blank" accept-charset="UTF-8" method="GET"><a href="https://donguri.5ch.net" target="_blank">垢版</a> | <input type="hidden" name="url" value="//egg.5ch.net/test/read.cgi/software/1712375061/"><input type="hidden" name="date" value="2024/04/06(土) 22:59:52.12"><button type="submit" style="border: none; background: none; cursor: pointer; font-size: 1em; color:#9b4dca;">大砲</button></form></span></summary><span style="width:100%;"><span class="date">2024/04/06(土) 22:59:52.12</span><span class="uid">ID:GhADKlpu0</span></span></details><section class="post-content"> 串外さないと書けないみたいね </section></article>

<article id="22" data-date="NG" data-userid="ID:plCuGxBZ0" data-id="22" class="clear post"><details open class="post-header"><summary><span class="postid">0022</span><span class="postusername"><b>名無しさん@お腹いっぱい。</b> <span style="color:green;">ころころ</span><b></b></span><span style="float:right; max-height: 1em;"><form action="https://donguri.5ch.net/confirm" target="_blank" accept-charset="UTF-8" method="GET"><a href="https://donguri.5ch.net" target="_blank">垢版</a> | <input type="hidden" name="url" value="//egg.5ch.net/test/read.cgi/software/1712375061/"><input type="hidden" name="date" value="2024/04/12(金) 00:48:28.91"><button type="submit" style="border: none; background: none; cursor: pointer; font-size: 1em; color:#9b4dca;">大砲</button></form></span></summary><span style="width:100%;"><span class="date">2024/04/12(金) 00:48:28.91</span><span class="uid">ID:plCuGxBZ0</span></span></details><section class="post-content"> うらやましい生き方してんな </section></article>

<article id="183" data-date="NG" data-userid="ID:AJIIs3rB0" data-id="183" class="clear post"><details open class="post-header"><summary><span class="postid">0183</span><span class="postusername"><b>名無しさん@お腹いっぱい。</b></span><span style="float:right; max-height: 1em;"><form action="https://donguri.5ch.net/confirm" target="_blank" accept-charset="UTF-8" method="GET"><a href="https://donguri.5ch.net" target="_blank">垢版</a> | <input type="hidden" name="url" value="//egg.5ch.net/test/read.cgi/software/1712375061/"><input type="hidden" name="date" value="2024/04/22(月) 06:14:25.04"><button type="submit" style="border: none; background: none; cursor: pointer; font-size: 1em; color:#9b4dca;">大砲</button></form></span></summary><span style="width:100%;"><span class="date">2024/04/22(月) 06:14:25.04</span><span class="uid">ID:AJIIs3rB0</span></span></details><section class="post-content"> アイスタ下にこびり付くか </section></article>

<article id="1" data-date="NG" data-userid="ID:gRjAjO1l9" data-id="1" class="clear post"><details open class="post-header"><summary><span class="postid">0001</span><span class="postusername"><b><font color="#fc0584">首都圏の虎 ★</font></b></span><span style="float:right; max-height: 1em;"><form action="https://donguri.5ch.net/confirm" target="_blank" accept-charset="UTF-8" method="GET"><a href="https://donguri.5ch.net" target="_blank">垢版</a> | <input type="hidden" name="url" value="//asahi.5ch.net/test/read.cgi/newsplus/1714032617/"><input type="hidden" name="date" value="2024/04/25(木) 17:10:17.26"><button type="submit" style="border: none; background: none; cursor: pointer; font-size: 1em; color:#9b4dca;">大砲</button></form></span></summary><span style="width:100%;"><span class="date">2024/04/25(木) 17:10:17.26</span><span class="uid">ID:gRjAjO1l9</span></span></details><section class="post-content">  大阪府の吉村洋文知事は25日の記者会見で、人口減少を食い止める政策実現のため、0歳児から選挙権を認めるべきだとの考えを示した。「子どもが政治的影響を持つ仕組みになれば、政治家もその方向を向く」と述べ、転換を促す必要性を指摘。市町村の4割超に「消滅可能性」があるとする民間組織の報告書公表に関する質問に答えた。 <br> <br> 全文はソースで 最終更新:4/25(木) 16:59 <br> <a href="http://jump.5ch.net/?https://news.yahoo.co.jp/articles/d1f38afdab927d6535aee550e6d86176cf242df4" target="_blank">https://news.yahoo.co.jp/articles/d1f38afdab927d6535aee550e6d86176cf242df4</a> </section></article>

<article id="68" data-date="NG" data-userid="ID:YO3B1XQ+r" data-id="68" class="clear post"><details open class="post-header"><summary><span class="postid">0068</span><span class="postusername"><b>番組の途中ですがアフィサイトへの転載は禁止です</b> <span style="color:green;">警備員[Lv.7][新][苗][芽]</span><b> </b>(オッペケ Srea-/G3D)<b></b></span><span style="float:right; max-height: 1em;"><form action="https://donguri.5ch.net/confirm" target="_blank" accept-charset="UTF-8" method="GET"><a href="https://donguri.5ch.net" target="_blank">垢版</a> | <input type="hidden" name="url" value="//greta.5ch.net/test/read.cgi/poverty/1714034581/"><input type="hidden" name="date" value="2024/04/25(木) 17:49:10.45"><button type="submit" style="border: none; background: none; cursor: pointer; font-size: 1em; color:#9b4dca;">大砲</button></form></span></summary><span style="width:100%;"><span class="date">2024/04/25(木) 17:49:10.45</span><span class="uid">ID:YO3B1XQ+r</span></span></details><section class="post-content"> 文明の終焉って感じで感慨深いな <br> 後はメギドの火を待つばかりか </section></article>

<article id="71" data-date="NG" data-userid="ID:JroisIg/0" data-id="71" class="clear post"><details open class="post-header"><summary><span class="postid">0071</span><span class="postusername"><b>顔デカ</b> <span style="color:green;">警備員[Lv.12(前19)][苗][芽]</span><b> </b>(ワッチョイW ac0d-89Xz)<b></b></span><span style="float:right; max-height: 1em;"><form action="https://donguri.5ch.net/confirm" target="_blank" accept-charset="UTF-8" method="GET"><a href="https://donguri.5ch.net" target="_blank">垢版</a> | <input type="hidden" name="url" value="//greta.5ch.net/test/read.cgi/poverty/1714034581/"><input type="hidden" name="date" value="2024/04/25(木) 17:49:29.37"><button type="submit" style="border: none; background: none; cursor: pointer; font-size: 1em; color:#9b4dca;">大砲</button></form></span></summary><span style="width:100%;"><span class="date">2024/04/25(木) 17:49:29.37</span><span class="uid">ID:JroisIg/0</span></span><span class="be r2BP"><a href="http://be.5ch.net/user/681125504" target="_blank">?2BP(1000)</a></span></details><section class="post-content"> <img src="//img.5ch.net/ico/nida.gif"> <br> ゆっくりバイデンだぜ! <br> 今日はイスラエルの偉大さを解説するんだぜ! </section></article>
Loading

0 comments on commit 54d7444

Please sign in to comment.