From 0031828f258107e78b7d086d374549743ec7e33e Mon Sep 17 00:00:00 2001 From: Kun Jinkao <45487685+Snoopy1866@users.noreply.github.com> Date: Wed, 9 Oct 2024 11:23:23 +0800 Subject: [PATCH] fix: use escaped char instead of ASCII group try to partially fix #60 --- docs/ReadRTF.md | 6 +++--- gbk/MixCWFont.sas | 10 +++++----- gbk/ReadRTF.sas | 10 ++++++---- utf8/MixCWFont.sas | 10 +++++----- utf8/ReadRTF.sas | 8 ++++---- 5 files changed, 23 insertions(+), 21 deletions(-) diff --git a/docs/ReadRTF.md b/docs/ReadRTF.md index c64f7fe..975091e 100644 --- a/docs/ReadRTF.md +++ b/docs/ReadRTF.md @@ -120,7 +120,7 @@ OUTDATA = t_7_1_1 - 标题:`/\\outlinelevel\d/o` - 表头定义起始行:`/\\trowd\\trkeep\\trhdr\\trq[lcr]/o` - 表头属性定义行:`/\\clbrdr[tlbr]\\brdrs\\brdrw\d*\\brdrcf\d*(?:\\clbrdr[tlbr]\\brdrs\\brdrw\d*\\brdrcf\d*)*\\cltxlrt[bl]\\clvertal[tcb](?:\\clcbpat\d*)?\\cellx(\d+)/o` - - 数据行:`/^\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])*)\\cell\}$/o` + - 数据行:`/^\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])*)\\cell\}$/o` - 分节符标识行:`/\\sect\\sectd\\linex\d*\\endnhere\\pgwsxn\d*\\pghsxn\d*\\lndscpsxn\\headery\d*\\footery\d*\\marglsxn\d*\\margrsxn\d*\\margtsxn\d*\\margbsxn\d*/o` 5. 开始转换数据。调用 [Cell_Transcode](Transcode.md#cell_transcode) 函数,将单元格内的字符串转换为可读的字符串; 6. 使用 `PROC TRANSPOSE` 对上一步产生的数据集进行转置; @@ -219,14 +219,14 @@ RTF 文件单行字符串没有限制长度,为确保读取的 RTF 标记字 4. 
使用以下正则表达式匹配数据行 ``` -^\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])*)\\cell\}$/o +/^\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])*)\\cell\}$/o ``` 上述正则表达式中,包含了 3 种类型的数据编码形式: - `\\'[0-9A-F]{2}` : GBK - `\\u\d{1,5};` : UTF-8 -- `[[:ascii:]]` : ASCII +- `[\x20-\x7e]` : ASCII 可打印字符(0x20–0x7E) 其中 GBK 和 UTF-8 字符是以转义字符表示的,需要进一步转换成以 SAS 当前环境下的编码存储的字符串。 diff --git a/gbk/MixCWFont.sas b/gbk/MixCWFont.sas index f127a57..233c84c 100644 --- a/gbk/MixCWFont.sas +++ b/gbk/MixCWFont.sas @@ -213,7 +213,7 @@ data _tmp_rtf_polish(compress = yes); set _tmp_rtf_font_added; - reg_header_cell_id = prxparse("/\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])+)\{\\line\}/o"); + reg_header_cell_id = prxparse("/\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])+)\{\\line\}/o"); length tmp_line $32767; retain tmp_line; @@ -252,8 +252,8 @@ length context_mixed $32767; /*޸ĵԪı*/ - reg_cell_id = prxparse("/\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])+)\\cell\}/o"); - reg_cell_inside_id = prxparse("/\\animtext\d*\\ul\d*\\strike\d*\\b\d*\\i\d*\\f\d*\\fs\d*\\cf\d*((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])+)/o"); + reg_cell_id = prxparse("/\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])+)\\cell\}/o"); + reg_cell_inside_id = prxparse("/\\animtext\d*\\ul\d*\\strike\d*\\b\d*\\i\d*\\f\d*\\fs\d*\\cf\d*((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])+)/o"); reg_cell_change_font_id = prxparse("s/(?!<\\f&cfont_id )((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};)+)/\\f&cfont_id $1\\f&wfont_id /o"); if prxmatch(reg_cell_id, trim(line)) then do; call prxposn(reg_cell_id, 1, st, len); @@ -274,7 +274,7 @@ 
end; /*޸ıı*/ - reg_outllv_id = prxparse("/\\outlinelevel\d*\{((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])+)\}/o"); + reg_outllv_id = prxparse("/\\outlinelevel\d*\{((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])+)\}/o"); reg_outlnlv_change_font_id = prxparse("s/(?!<\\f&cfont_id )((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};)+)/\\f&cfont_id $1\\f&wfont_id /o"); if prxmatch(reg_outllv_id, trim(line)) then do; @@ -291,7 +291,7 @@ end; /*޸Ľעı*/ - reg_ftnt_id = prxparse("/\\pard\\b\d*\\i\d*\\chcbpat\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{\}\\q[lcr]\\fs\d*((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])+)\\cf\d*\\chcbpat\d*/o"); + reg_ftnt_id = prxparse("/\\pard\\b\d*\\i\d*\\chcbpat\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{\}\\q[lcr]\\fs\d*((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])+)\\cf\d*\\chcbpat\d*/o"); reg_ftnt_change_font_id = prxparse("s/(?!<\\f&cfont_id )((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};)+)/\\f&cfont_id $1\\f&wfont_id /o"); if prxmatch(reg_ftnt_id, trim(line)) then do; diff --git a/gbk/ReadRTF.sas b/gbk/ReadRTF.sas index 78c862f..90a04ad 100644 --- a/gbk/ReadRTF.sas +++ b/gbk/ReadRTF.sas @@ -94,6 +94,8 @@ options cmplib = sasuser.func; data _tmp_rtf_data_polish_header(compress = &compress); set _tmp_rtf_data; + len = length(line); + length break_line $32767.; reg_header_break_id = prxparse("/^(\\pard\\plain\\intbl\\keepn\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{.*){\\line}$/o"); @@ -132,9 +134,9 @@ options cmplib = sasuser.func; length line_data_part $32767 line_data_part_buffer $32767; - reg_data_line_start_id = prxparse("/^\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])*)$/o"); - reg_data_line_mid_id = prxparse("/^((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])*)$/o"); - reg_data_line_end_id = prxparse("/^((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])*)\\cell\}$/o"); + reg_data_line_start_id = 
prxparse("/^\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])*)$/o"); + reg_data_line_mid_id = prxparse("/^((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])*)$/o"); + reg_data_line_end_id = prxparse("/^((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])*)\\cell\}$/o"); retain line_data_part ""; retain line_data_part_found 0; @@ -229,7 +231,7 @@ options cmplib = sasuser.func; reg_outlinelevel_id = prxparse("/\\outlinelevel\d/o"); reg_header_line_id = prxparse("/\\trowd\\trkeep\\trhdr\\trq[lcr]/o"); reg_header_def_line_id = prxparse("/\\clbrdr[tlbr]\\brdrs\\brdrw\d*\\brdrcf\d*(?:\\clbrdr[tlbr]\\brdrs\\brdrw\d*\\brdrcf\d*)*\\cltxlrt[bl]\\clvertal[tcb](?:\\clcbpat\d*)?\\cellx(\d+)/o"); - reg_data_line_id = prxparse("/^\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])*)\\cell\}$/o"); + reg_data_line_id = prxparse("/^\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])*)\\cell\}$/o"); reg_sect_line_id = prxparse("/\\sect\\sectd\\linex\d*\\endnhere\\pgwsxn\d*\\pghsxn\d*\\lndscpsxn\\headery\d*\\footery\d*\\marglsxn\d*\\margrsxn\d*\\margtsxn\d*\\margbsxn\d*/o"); diff --git a/utf8/MixCWFont.sas b/utf8/MixCWFont.sas index 67d074d..d3b3c4d 100644 --- a/utf8/MixCWFont.sas +++ b/utf8/MixCWFont.sas @@ -213,7 +213,7 @@ data _tmp_rtf_polish(compress = yes); set _tmp_rtf_font_added; - reg_header_cell_id = prxparse("/\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])+)\{\\line\}/o"); + reg_header_cell_id = prxparse("/\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])+)\{\\line\}/o"); length tmp_line $32767; retain tmp_line; @@ -252,8 +252,8 @@ length context_mixed $32767; /*修改单元格文本字体*/ - reg_cell_id = 
prxparse("/\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])+)\\cell\}/o"); - reg_cell_inside_id = prxparse("/\\animtext\d*\\ul\d*\\strike\d*\\b\d*\\i\d*\\f\d*\\fs\d*\\cf\d*((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])+)/o"); + reg_cell_id = prxparse("/\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])+)\\cell\}/o"); + reg_cell_inside_id = prxparse("/\\animtext\d*\\ul\d*\\strike\d*\\b\d*\\i\d*\\f\d*\\fs\d*\\cf\d*((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])+)/o"); reg_cell_change_font_id = prxparse("s/(?!<\\f&cfont_id )((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};)+)/\\f&cfont_id $1\\f&wfont_id /o"); if prxmatch(reg_cell_id, trim(line)) then do; call prxposn(reg_cell_id, 1, st, len); @@ -274,7 +274,7 @@ end; /*修改标题文本字体*/ - reg_outllv_id = prxparse("/\\outlinelevel\d*\{((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])+)\}/o"); + reg_outllv_id = prxparse("/\\outlinelevel\d*\{((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])+)\}/o"); reg_outlnlv_change_font_id = prxparse("s/(?!<\\f&cfont_id )((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};)+)/\\f&cfont_id $1\\f&wfont_id /o"); if prxmatch(reg_outllv_id, trim(line)) then do; @@ -291,7 +291,7 @@ end; /*修改脚注文本字体*/ - reg_ftnt_id = prxparse("/\\pard\\b\d*\\i\d*\\chcbpat\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{\}\\q[lcr]\\fs\d*((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])+)\\cf\d*\\chcbpat\d*/o"); + reg_ftnt_id = prxparse("/\\pard\\b\d*\\i\d*\\chcbpat\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{\}\\q[lcr]\\fs\d*((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])+)\\cf\d*\\chcbpat\d*/o"); reg_ftnt_change_font_id = prxparse("s/(?!<\\f&cfont_id )((?:\\\x27[0-9A-F]{2}|\\u\d{1,5};)+)/\\f&cfont_id $1\\f&wfont_id /o"); if prxmatch(reg_ftnt_id, trim(line)) then do; diff --git a/utf8/ReadRTF.sas b/utf8/ReadRTF.sas index 9f27944..266104d 100644 --- a/utf8/ReadRTF.sas +++ b/utf8/ReadRTF.sas @@ -132,9 +132,9 @@ options cmplib = sasuser.func; length 
line_data_part $32767 line_data_part_buffer $32767; - reg_data_line_start_id = prxparse("/^\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])*)$/o"); - reg_data_line_mid_id = prxparse("/^((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])*)$/o"); - reg_data_line_end_id = prxparse("/^((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])*)\\cell\}$/o"); + reg_data_line_start_id = prxparse("/^\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])*)$/o"); + reg_data_line_mid_id = prxparse("/^((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])*)$/o"); + reg_data_line_end_id = prxparse("/^((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])*)\\cell\}$/o"); retain line_data_part ""; retain line_data_part_found 0; @@ -229,7 +229,7 @@ options cmplib = sasuser.func; reg_outlinelevel_id = prxparse("/\\outlinelevel\d/o"); reg_header_line_id = prxparse("/\\trowd\\trkeep\\trhdr\\trq[lcr]/o"); reg_header_def_line_id = prxparse("/\\clbrdr[tlbr]\\brdrs\\brdrw\d*\\brdrcf\d*(?:\\clbrdr[tlbr]\\brdrs\\brdrw\d*\\brdrcf\d*)*\\cltxlrt[bl]\\clvertal[tcb](?:\\clcbpat\d*)?\\cellx(\d+)/o"); - reg_data_line_id = prxparse("/^\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[[:ascii:]])*)\\cell\}$/o"); + reg_data_line_id = prxparse("/^\\pard\\plain\\intbl(?:\\keepn)?\\sb\d*\\sa\d*\\q[lcr]\\f\d*\\fs\d*\\cf\d*\{((?:\\'[0-9A-F]{2}|\\u\d{1,5};|[\x20-\x7e])*)\\cell\}$/o"); reg_sect_line_id = prxparse("/\\sect\\sectd\\linex\d*\\endnhere\\pgwsxn\d*\\pghsxn\d*\\lndscpsxn\\headery\d*\\footery\d*\\marglsxn\d*\\margrsxn\d*\\margtsxn\d*\\margbsxn\d*/o");