forked from jdmonaco/pdf-title-rename
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdocx-rename-batch.py
115 lines (91 loc) · 3.58 KB
/
docx-rename-batch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
"""
Author: Autoz
Extract title from word:docx file.
Depends on: docx2txt.
Origin from: [pdftitle.py](https://gist.github.com/hanjianwei/6838974)
Usage:
post-docx-batch.py *.docx
"""
import io # cStringIO in python2
import getopt
import os
import re
import string
import sys
import docx2txt
__all__ = ['pdf_title']
def check_contain_chinese(check_str):
return any((u'\u4e00' <= char <= u'\u9fff') for char in check_str)
def check_contain_number(check_str):
return any(char.isdigit() for char in check_str)
def sanitize(filename):
"""Turn string to valid file name.
"""
valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
return ''.join([c for c in filename if c in valid_chars])
def sanitize_chinese(filename):
return re.sub('\?|\.|\。|\!|\/|\;|\:|\*|\>|\<|\~|\(|\)|\[|\]|[A-Za-z0-9]|', '', filename)
def copyright_line(line):
"""Judge if a line is copyright info.
"""
return re.search(r'technical\s+report|proceedings|preprint|to\s+appear|submission', line.lower())
def empty_str(s):
return len(s.strip()) == 0
def title_start_end_postal(lines,search_lines=30,tag=''):
filted_lines = []; find_tag = False;
search_lines = search_lines if search_lines > len(lines) else len(lines)
for i, line in enumerate(lines[:search_lines]):
if not empty_str(line):
if check_contain_chinese(line): # user name or chinese title
if tag in line and check_contain_number(line): # tag and numbers as doc's No.
filted_lines.append(line.strip())
print('=cut line=%s'% line.strip())
find_tag = True
break
filted_lines.append(line.strip())
print('=cut line=%s'% line.strip())
continue
return filted_lines,find_tag
def docx_title(filename,tag):
text = docx2txt.process(filename)
lines,find_tag = title_start_end_postal(text,search_lines=30,tag=tag)
title = ('_'.join(line.strip() for line in lines).replace(' ','_'))
if not find_tag: title = title[:40]
if empty_str(title):
title = os.path.splitext(filename)[0]
print('Can not get info to rename.')
new_name = os.path.join('', title + os.path.splitext(filename)[1])
return new_name
if __name__ == "__main__":
def usage():
print("Usage: %s [--dry-run] [-p|--post-tag post_tag] filenames" % sys.argv[0])
input('There is no flie in root folder, please enter anykey to exit')
try:
opts, args = getopt.getopt(sys.argv[1:], 'np:', ['post-tag','dry-run'])
except getopt.GetoptError:
usage()
from glob import glob
#args = glob('*.docx') # for build in pyinstall to .exe apps with no arguments
#print('====docs====',args)
if not args: usage()
readall = False
dry_run = False
post_tag = '编号' # stop tag
for opt, arg in opts:
if opt in ['-a', '--readall']:
readall = True
elif opt in ['-n', '--dry-run']:
dry_run = True
elif opt in ['-p', '--post-tag']:
post_tag = arg
for filename in args:
new_name = docx_title(filename,post_tag)
print ("%s => %s" % (filename,new_name))
if not dry_run:
if(filename == new_name): continue
if os.path.exists(new_name):
new_name = os.path.splitext(new_name)[0] + "copy" + os.path.splitext(new_name)[1]
print("===Target already exists! Rename to %s" % new_name)
os.rename(filename, new_name)
input('Done, please enter anykey to exit')