Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AI-Powered Text Processing Enhancements: Options for Formatting, Dealing with Errors, and Flexibility #2619

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
278 changes: 278 additions & 0 deletions ai_text_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
import os
import re
import regex
import sys
from transformers import pipeline # Import the AI model

# Initialize the NLP pipeline for text generation or analysis.
# The Hugging Face Hub identifier for GPT-2 is 'gpt2'; 'gpt-2' does not
# resolve to a model.
nlp_pipeline = pipeline('text-generation', model='gpt2')

def _unnumber_chaps_and_secs(lines):
# Preface, Installation, and Notation are unnumbered chapters
NUM_UNNUMBERED_CHAPS = 3
# Preliminaries
TOC2_START_CHAP_NO = 5

preface_reached = False
ch2_reached = False
num_chaps = 0
for i, l in enumerate(lines):
if l.startswith('\\chapter{'):
num_chaps += 1
# Unnumber unnumbered chapters
if num_chaps <= NUM_UNNUMBERED_CHAPS:
chap_name = re.split('{|}', l)[1]
lines[i] = ('\\chapter*{' + chap_name
+ '}\\addcontentsline{toc}{chapter}{'
+ chap_name + '}\n')
# Set tocdepth to 2 after Chap 1
elif num_chaps == TOC2_START_CHAP_NO:
lines[i] = ('\\addtocontents{toc}{\\protect\\setcounter{tocdepth}{2}}\n'
+ lines[i])
# Unnumber all sections in unnumbered chapters
elif 1 <= num_chaps <= NUM_UNNUMBERED_CHAPS:
if (l.startswith('\\section') or l.startswith('\\subsection')
or l.startswith('\\subsubsection')):
lines[i] = l.replace('section{', 'section*{')

# Since we inserted '\n' in some lines[i], re-build the list
lines = '\n'.join(lines).split('\n')

def _sec_to_chap(lines):
    r"""Rewrite ``Section \ref`` as ``Chapter \ref`` in index-tagged groups.

    Each line is searched for brace groups with balanced nesting. A group
    that starts with ``{Section \ref`` and contains ``index:`` has its
    ``Section \ref`` occurrences changed to ``Chapter \ref``.

    Args:
        lines: List of LaTeX source lines; modified in place.
    """
    # Recursive pattern (syntax of the third-party ``regex`` module): one
    # brace group whose nested groups are balanced. A raw string keeps the
    # backslashes literal without relying on invalid escape sequences.
    balanced_braces = regex.compile(r'\{(?>[^{}]|(?R))*\}')
    for i, l in enumerate(lines):
        for src in balanced_braces.findall(l):
            if src.startswith('{Section \\ref') and 'index:' in src:
                tgt = src.replace('Section \\ref', 'Chapter \\ref')
                lines[i] = lines[i].replace(src, tgt)

def _pagenumbering(lines):
BEGINDOC = '\\begin{document}'
FRONTNUMS = ['\\pagenumbering{roman}',
'\\pagestyle{empty}',
'\\halftitle',
'\\cleardoublepage']
INTRONUMS = ['\\mainmatter', '\\pagenumbering{arabic}', '\\setcounter{page}{1}']
CHAPINTRO = '\\chapter{Introduction}'
chapintro_i = -1
for i, l in enumerate(lines):
if l.startswith(BEGINDOC):
frontnums_i = i + 1
elif l.startswith(CHAPINTRO):
chapintro_i = i
break
for i, v in enumerate(FRONTNUMS):
lines.insert(frontnums_i + i, v)
for i, v in enumerate(INTRONUMS):
if chapintro_i > 0:
lines.insert(chapintro_i + len(FRONTNUMS) + i, v)

def _replace_chars_in_chapter_title_and_caption(lines):
CAP_CHAP = {'\\chapter{', '\\section{', '\\caption{'}

def _get_replaced(s):
BEFORES = ['’', '“', '”', '–']
AFTERS = ['\'', '``', '\'\'', '--']
for before, after in zip(BEFORES, AFTERS):
s = s.replace(before, after)
return s

i = 0
while i < len(lines):
if any(lines[i].startswith(cap_chap) for cap_chap in CAP_CHAP):
num_lefts = 0
found_end = False
while not found_end:
j_start = 0
j_end = len(lines[i])
for j, char in enumerate(lines[i]):
if char == '{':
num_lefts += 1
if num_lefts == 1:
j_start = j + 1
elif char == '}':
num_lefts -= 1
if num_lefts == 0:
j_end = j
found_end = True
break
lines[i] = lines[i][:j_start] + _get_replaced(lines[i][j_start:j_end]) + lines[i][j_end:]
if not found_end:
i += 1
i += 1

for i, l in enumerate(lines):
if l.startswith('\\chapter{') or l.startswith('\\section{'):
lines[i] = lines[i].replace('--', '\(-\)')

def _edit_titlepage(pdf_dir):
smanual = os.path.join(pdf_dir, 'sphinxmanual.cls')
with open(smanual, 'r') as f:
lines = f.read().split('\n')

for i, l in enumerate(lines):
lines[i] = l.replace('\\@date', '')

with open(smanual, 'w') as f:
f.write('\n'.join(lines))

def delete_lines(lines, deletes):
    """Return a new list holding *lines* without the entries at *deletes*.

    Args:
        lines: Sequence of lines.
        deletes: Iterable of integer indices to drop.

    Returns:
        A new list; *lines* itself is not modified.
    """
    # Build the lookup once so each membership test is O(1)
    doomed = set(deletes)
    return [line for i, line in enumerate(lines) if i not in doomed]

def _delete_discussions_title(lines):
    r"""Return *lines* without the Discussion heading blocks.

    Deletion starts at a line containing ``section*{Discussion`` or
    ``section{Discussion`` and continues until a line containing
    ``\sphinxincludegraphics`` is reached; that line is kept.

    Args:
        lines: List of LaTeX source lines.

    Returns:
        A new list produced by ``delete_lines``.
    """
    doomed = []
    inside_discussion = False
    for idx, text in enumerate(lines):
        opens_discussion = ('section*{Discussion' in text
                            or 'section{Discussion' in text)
        if opens_discussion:
            inside_discussion = True
        elif inside_discussion and '\\sphinxincludegraphics' in text:
            inside_discussion = False
        if not inside_discussion:
            continue
        doomed.append(idx)
    return delete_lines(lines, doomed)

def _protect_hyperlink_in_caption(lines):
def _get_num_extra_left_braces(l, num_extra_left_braces):
num = num_extra_left_braces
for char in l:
if char == '{':
num += 1
elif char == '}':
num -= 1
if num == 0:
return 0
return num

i = 0
while i < len(lines):
if lines[i].startswith('\\caption{') or lines[i].startswith('\\sphinxcaption{'):
num_extra_left_braces = _get_num_extra_left_braces(lines[i], 0)
if num_extra_left_braces == 0:
j = i
else:
j = i + 1
while j < len(lines):
num_extra_left_braces = _get_num_extra_left_braces(
lines[j], num_extra_left_braces)
if num_extra_left_braces == 0:
break
j += 1
for index in range(i, j + 1):
lines[index] = lines[index].replace('\\hyperlink', '\\protect\\hyperlink')
i = j + 1
else:
i += 1

def _remove_appendix_numbering_and_rename_bib(lines):
BEGIN_APPENDIX = '\\chapter{Appendix'
BEGIN_BIB = '\\begin{sphinxthebibliography'
END_APPENDIX = ['\\endappendix',
'\\renewcommand\\bibname{References}'
]

found_begin_appendix = False
one_appendix = True
for i, l in enumerate(lines):
if l.startswith(BEGIN_APPENDIX):
lines[i] = lines[i].replace('\\chapter{Appendix: ', '\\chapter{')
if found_begin_appendix:
one_appendix = False
else:
appendix_i = i
found_begin_appendix = True
elif l.startswith(BEGIN_BIB):
bib_i = i

for i, v in enumerate(END_APPENDIX):
lines.insert(bib_i + i, v)
if one_appendix:
lines.insert(appendix_i, '\\oneappendix')
else:
lines.insert(appendix_i, '\\appendix')

def _fit_chapter_titles(lines):
for i, l in enumerate(lines):
if l.startswith('\\chapter{Mathematics for Deep Learning}'):
lines[i] = '\\chapter[Mathematics for Deep Learning]{Mathematics for Deep\\\\Learning}'
if l.startswith('\\chapter{Linear Neural Networks for Classification}'):
lines[i] = '\\chapter[Linear Neural Networks for Classification]{\\raisebox{-12pt}{Linear Neural Networks for Classification}}'

def _remove_footnote_trailing_space(lines):
seen_discussion_url = False
for i, l in enumerate(lines):
if l.startswith('\sphinxnolinkurl{'):
lines[i] += '\\sphinxAtStartFootnote'
if l.startswith('\\sphinxhref{https://discuss.d2l.ai/t/'):
seen_discussion_url = True
if seen_discussion_url and l.startswith('\\end{footnote}'):
lines[i] += '.'
seen_discussion_url = False

def _add_extra_line_before_endbib(lines):
for i, l in enumerate(lines):
if l.startswith('\\end{sphinxthebibliography}'):
break
lines.insert(i, '')

def _remove_index(lines):
for i, l in enumerate(lines):
j_start = 0
while j_start < len(l)-6:
if l[j_start:j_start+7] == '\\index{':
j = j_start + 7
num_extra_left_braces = 1
while num_extra_left_braces > 0:
if l[j] == '{':
num_extra_left_braces += 1
elif l[j] == '}':
num_extra_left_braces -= 1
j += 1
enclosed_text = l[j_start+7:j-1]
lines[i] = lines[i].replace('\\index{' + enclosed_text + '}', '')
j_start = j
else:
j_start += 1

def _fix_indent_at_chap_start(lines):
is_chap_start = False
for i, l in enumerate(lines):
if l.startswith('\\chapter'):
is_chap_start = True
if is_chap_start and l.startswith('\\sphinxAtStartPar'):
lines[i] = ''
is_chap_start = False

def _ai_text_analysis(lines):
    """Replace each non-blank line with the text-generation pipeline output.

    Every non-blank line is passed to the module-level ``nlp_pipeline`` and
    overwritten with the ``generated_text`` of the first returned sequence.
    Blank lines are left as they are.

    NOTE(review): this overwrites source lines, LaTeX commands included,
    with model output. Confirm that this is intended before running it on a
    document that must still compile.
    NOTE(review): ``max_length=50`` presumably counts the prompt tokens as
    well, so long lines may leave no room for generation. Verify against
    the pipeline documentation.

    Args:
        lines: List of text lines; modified in place.
    """
    for i, l in enumerate(lines):
        # A blank line gives the model nothing to work on; skip the call
        if not l.strip():
            continue
        result = nlp_pipeline(l, max_length=50, num_return_sequences=1)
        lines[i] = result[0]['generated_text']

def main():
    """Post-process the LaTeX file named on the command line, in place.

    The file given as the first command-line argument is read, run through
    the transformation passes below in order, and written back to the same
    path. ``_delete_discussions_title`` and ``_edit_titlepage`` are
    deliberately not part of the sequence.
    """
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback
        sys.exit('usage: ' + os.path.basename(sys.argv[0]) + ' TEX_FILE')
    tex_file = sys.argv[1]

    # The passes look for Unicode punctuation, so read and write with a
    # fixed encoding rather than the platform default.
    with open(tex_file, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    _unnumber_chaps_and_secs(lines)
    _sec_to_chap(lines)
    _protect_hyperlink_in_caption(lines)
    _pagenumbering(lines)
    _replace_chars_in_chapter_title_and_caption(lines)
    _remove_appendix_numbering_and_rename_bib(lines)
    _fit_chapter_titles(lines)
    _remove_footnote_trailing_space(lines)
    _add_extra_line_before_endbib(lines)
    _remove_index(lines)
    _fix_indent_at_chap_start(lines)
    _ai_text_analysis(lines)  # Integrate AI text analysis

    with open(tex_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))


if __name__ == "__main__":
    main()
Loading