-
Notifications
You must be signed in to change notification settings - Fork 3
/
grep2awk
401 lines (363 loc) · 13.5 KB
/
grep2awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
# grep2awk:
# A zle widget that converts a `grep` invocation on the command line
# into the corresponding `awk` invocation.
# How to use:
# Drop this file somewhere in your $fpath. And configure your shell to
# make use of it:
#
# autoload grep2awk
# zle -N grep2awk
# bindkey "^X^A" grep2awk
#
# This makes the widget available under <CTRL-X><CTRL-A>.
#
# The files t/comptest and t/ztst.zsh are taken (and modified) from the
# zsh project. This project can be used under the same terms as the zsh
# project.
#
# By Joep van Delft, 2015
# NOTE: when this file is autoloaded, its contents are executed as the
# body of the first `grep2awk` call. That call defines the real function
# below and then invokes it (last line of the file), so the three
# statements here run only once.
emulate -L zsh
setopt extendedglob
autoload -Uz split-shell-arguments
# grep2awk [HELPER ARG]
#   Without arguments: zle widget that rewrites the grep command found in
#   $BUFFER into an awk command and repositions the cursor.
#   With arguments: call the helper function named HELPER with ARG and
#   echo its output (debugging/testing aid).
grep2awk() {
# Every invocation after the first one enters here directly, so the
# option environment has to be established inside the function as well;
# otherwise the user's interactive options would be in effect.
emulate -L zsh
setopt extendedglob
# Needs to be in its own scope, to not have the variables leak to
# the main shell process.
local replacement_line REPLY REPLY2 flag cursor
# Loop counters (i, l), the current option letter (o) and the name of the
# awk command (awk) are assigned further down; declare them here so they
# do not end up as globals in the interactive shell.
local awk i l o
local -a reply
typeset -A opt
# Define some functions first:
# - regex2awkprog
# - fixed2ere
# - bre2ere
# - ere2ere
# - swapcommand
# regex2awkprog QUOTED_REGEX
#   Translate the grep pattern in $1 (still carrying one level of shell
#   quoting) into a complete awk program and print that program,
#   single-quoted, on stdout without a trailing newline.
#
#   Reads the grep options collected in the associative array `opt` of
#   the calling scope (keys are single option letters, value 1 when set).
#
#   Returns 1 without printing anything when the options conflict or
#   when the pattern translation helper reports failure.
#
#   The generated program uses IGNORECASE, BEGINFILE/ENDFILE and
#   nextfile, which are gawk features.
regex2awkprog() {
    local re prelude action prog percent
    #zle -M "${(kv)opt}"
    # Remove one level of quoting, so we have what grep would receive.
    re=${(Q)1}

    # A sanity check on the pattern-type options:
    if (( opt[E] + opt[F] + opt[G] + opt[P] > 1 )); then
        zle -M "grep2awk:Options E, F, G and P are mutually exclusive."
        zle -M "Exiting."
        return 1
    fi

    # Bring the pattern into ERE form, depending on its declared type.
    if (( opt[F] > 0 )); then
        re=$(fixed2ere "$re")
    fi
    if (( opt[G] == 1 || opt[E] + opt[F] + opt[P] == 0 )); then
        # It is a Basic Regular Expression. Do not continue with an
        # empty pattern when the translation was refused: that would
        # produce `//`, which matches every line.
        re=$(bre2ere "$re") || return 1
    fi
    if (( opt[E] > 0 )); then
        re=$(ere2ere "$re") || return 1
    fi

    # Word and whole-line matching are expressed by wrapping the pattern.
    if (( opt[w] > 0 )); then
        re="\\<($re)\\>"
    fi
    if (( opt[x] > 0 )); then
        re="^($re)$"
    fi

    # Replace any forward slash with an escaped one:
    re=${re//\//\\/}
    # Wrap in slashes, and replace any single quote with its
    # ASCII code (the program itself is single-quoted below):
    re=/${re//\'/\\\\047}/
    # Negate the pattern if told so:
    if (( opt[v] > 0 )); then
        re="!$re"
    fi
    if (( opt[i] + opt[y] > 0 )); then
        prelude+="BEGIN{IGNORECASE=1}; "
    fi

    # The print action. Add some extra stuff if line count or file name
    # are enabled.
    action="{print ${opt[H]:+FILENAME\":\"}${opt[n]:+NR\":\"}\$0}"

    # c, H, l and L each define the output format; combining them is
    # rejected.
    if (( opt[H] + opt[l] + opt[L] + opt[c] > 1 )); then
        zle -M "grep2awk:Options c, H, l and L are mutually exclusive."
        zle -M "Exiting."
        return 1
    fi
    if (( opt[c] > 0 )); then
        # This is incompatible with `grep -c`, as files where the count is 0
        # are not displayed. This is saner behavior, IMHO.
        prelude+='BEGINFILE{c=0};ENDFILE{if(c>0)print FILENAME":"c}; '
        action='{c++}'
    fi
    if (( opt[l] > 0 )); then
        action="{print FILENAME; nextfile}"
    fi
    if (( opt[L] > 0 )); then
        prelude+="BEGINFILE{flag=0};ENDFILE{if (flag==0)print FILENAME}; "
        action="{flag=1; nextfile}"
    fi

    prog="${prelude}${re} $action"
    prog=${(qq)prog}
    # The program is deliberately used as the printf FORMAT: the helpers
    # emit doubled backslashes which printf collapses again. A literal
    # `%` from the user's pattern must not be read as a conversion
    # directive, though, so double it first. The `%` is passed through a
    # parameter because a leading literal `%` in the pattern of
    # ${name//pattern/repl} would be taken as an end-of-string anchor.
    percent='%'
    printf -- "${prog//$percent/%%}"
}
fixed2ere() {
local p ere
# We already receive a backslash escaped version of the
# string by virtue of the ``(q)``-flag when this function is called.
# That means that we need to transform the characters in the
# first line to the corresponding character sequences in the
# second line:
#
# + | { } [ ] ( ) * . ? \ $ ^
# \+ \| \{ \} \[ \] \( \) \* \. \? \\ \$ \^
str=$1
ere=""
for i in {1..$#str}; do
if [[ $str[$i] =~ '\+|\||\{|\}|\[|\]|\(|\)|\*|\.|\?|\$|\^' ]]; then
ere+="\\$str[$i]"
elif [[ $str[$i] == $'\n' ]]; then
# This works after arrowing up into history, but not
# directly after writing it. Is there a way to detect
# if this is the case?
ere+='|'
elif [[ $str[$i] == '\' ]]; then
ere+="\\\\$str[$i]"
else
ere+="$str[$i]"
fi
done
printf -- "%s" "$ere"
}
# bre2ere BRE
#   Translate the Basic Regular Expression in $1 into an Extended Regular
#   Expression and print it on stdout without a trailing newline.
#   The pattern is scanned one character at a time; a handful of state
#   flags (see below) decide how each character is emitted.
#   Returns 1, printing nothing, for the inputs rejected inside the loop.
#   If the zstyle `debug` in context ':grep2awk:bre2ere:' is set, its
#   value is used as the path of a trace file.
bre2ere() {
# Basic Regular Expression to Extended Regular Expression
#
# What needs to be done:
# Translate BRE: \+,\{,\},\|,\(,\), +, {, }, |, (, )
# to ERE: +, {, }, |, (, ),\+,\{,\},\|,\(,\)
#
# But only if the backslash is not escaped with a backslash.
#
# NOTE(review): backslashes are appended doubled throughout; presumably
# because callers pass the result through printf (as format) or echo,
# which collapse them again -- confirm before changing any of them.
local bre ere i debug
# Variables tracking states:
#   s_esc          on while the previous character was an unescaped
#                  backslash that has not been consumed yet.
#   s_altern       on at the start of the pattern and right after `\|`
#                  or `\(`, i.e. where `^` and `*` are special-cased.
#   s_altern_start on only during the iteration that saw `\|` or `\(`;
#                  keeps the end-of-loop check from clearing s_altern
#                  in that same iteration.
#   s_brack        on while inside a bracket expression.
#   s_brack_start  on directly after the opening `[` (and after `[^`).
#   s_char_cls     on while inside a nested `[` ... `]` within a bracket
#                  expression.
local s_brack s_brack_start s_esc s_altern s_altern_start s_char_cls
bre=$1
ere=""
s_esc=off
s_altern=on
s_altern_start=off
s_brack=off
s_brack_start=off
s_char_cls=off
# Optional tracing: the style value is treated as a file name. The file
# is removed first and then receives one record per processed character.
zstyle -s ':grep2awk:bre2ere:' debug debug
if [[ -n $debug ]]; then
rm "$debug" 2>/dev/null
# Appends the current character and the state flags to the trace file.
debug() {
printf "bre[$i]=$bre[$i] s_esc=$s_esc " >> "$debug"
printf "s_brack=$s_brack s_brack_start=$s_brack_start " >> "$debug"
printf "s_altern=$s_altern s_alter_now=$s_altern_start\n" >> "$debug"
}
fi
for i in {1..$#bre}; do
[[ -n $debug ]] && debug
# Bracket expressions
# -------------------
# An unescaped `[` outside a bracket expression opens one.
if [[ $bre[$i] == '[' && $s_esc == off && $s_brack == off ]]; then
ere+="$bre[$i]"
s_brack=on
s_brack_start=on
# Character class accounting within bracket expressions, in order
# to keep track of the end of a bracket expression.
elif [[ $bre[$i] == '[' && $s_brack == on ]]; then
ere+="$bre[$i]"
s_char_cls=on
# The `]` that closes the nested class, not the bracket expression.
elif [[ $bre[$i] == ']' && $s_char_cls == on ]]; then
[[ $s_brack == on ]] || return 1
ere+="$bre[$i]"
s_char_cls=off
# Special casing the start of bracket expressions.
# s_brack_start allows for inclusion of ] as a character as per
# POSIX.2.
# A leading `^` keeps the "start" state alive for one more character.
elif [[ $s_brack == on && $s_brack_start == on ]]; then
ere+="$bre[$i]"
[[ $bre[$i] == '^' ]] \
&& s_brack_start=on \
|| s_brack_start=off
# A `]` anywhere else closes the bracket expression.
elif [[ $bre[$i] == ']' && $s_brack == on ]]; then
[[ $s_char_cls == off ]] || return 1
ere+="$bre[$i]"
s_brack=off
# Finally, the default bracket expansion action:
# copy the character unchanged.
elif [[ $s_brack == on ]]; then
ere+="$bre[$i]"
# Alternation and subexpressions
# ------------------------------
# Needed for special casing of *, ^ and $.
# `\|` and `\(` become `|` and `(` and start a new alternative.
elif [[ $bre[$i] =~ '\||\(' && $s_esc == on ]]; then
s_altern=on
s_altern_start=on
ere+="$bre[$i]" # The escape sign is dropped now
s_esc=off
# `^` at the start of the pattern or of an alternative is copied as an
# anchor; an escaped one in that position is rejected.
elif [[ $bre[$i] == '^' && $s_altern == on ]]; then
[[ $s_esc == off ]] || return 1
ere+='^'
elif [[ $bre[$i] == "*" && $s_altern == on ]]; then
# When * is at the start of an alternation/subexpression,
# escape it:
ere+='\\*'
elif [[ $bre[$i] == '$' ]]; then
# Dollar is a special character when it is at the end of
# string, end of alternation, or end of subexpression.
# Otherwise: it is not special. As we do not need to know the
# 'history' of the regex, we can use a lookahead to find out
# with which thing we are dealing.
#
# The behavior of $ is not documented in man 1 grep; it is,
# however, in `man 7 regex`. Some tests indicate that GNU grep
# adheres to the POSIX.2 specification.
#
# ${bre:$i:2} uses a zero-based offset, so it yields the two
# characters following the current (one-based) position; the test
# checks whether they are `\|` or `\)`.
if [[ $#bre -eq $i || ${bre:$i:2} =~ '\\(\||\))' ]]; then
[[ $s_esc == off ]] && ere+='$' || ere+='\\$'
else
[[ $s_esc == off ]] && ere+='\\$' || ere+='$'
fi
# Normal BRE to ERE:
# ------------------
# Escaped in BRE: emit the bare character.
elif [[ $bre[$i] =~ '\+|\{|\}|\)|\^|\?' && $s_esc == on ]]; then
# `(` and `|` are handled in the alternation part.
ere+="$bre[$i]" # The escape sign is dropped now
s_esc=off
# \' is emitted as the anchor `$`.
elif [[ $bre[$i] == "'" && $s_esc == on ]]; then
ere+='$'
s_esc=off
# \` is emitted as the anchor `^`.
elif [[ $bre[$i] == '`' && $s_esc == on ]]; then
ere+='^'
s_esc=off
# Unescaped in BRE: emit the character with an escape in front.
elif [[ $bre[$i] =~ '\+|\||\{|\}|\(|\)|\^|\?' && $s_esc == off ]]; then
ere+="\\\\$bre[$i]"
# An escaped backslash.
elif [[ $bre[$i] == '\\' && $s_esc == on ]]; then
ere+='\\\\\\\\'
s_esc=off
# An unescaped backslash: emit nothing yet, only remember it.
elif [[ $bre[$i] == "\\" && $s_esc == off ]]; then
s_esc=on
# Anything else is copied; a pending escape is dropped.
else
ere+="$bre[$i]"
s_esc=off
fi
# Leave the "start of alternative" state once a character other than
# `^` has been processed, unless this iteration just entered it.
[[ $s_altern == on \
&& $s_altern_start == off \
&& $bre[$i] != "^" \
]] && s_altern=off
s_altern_start=off
done
printf -- "%s" "$ere"
}
ere2ere() {
# The only difference where grep EREs differ from gawk EREs, is
# the treatment of "\`" and "\'". There are some other differences
# (treatment of "'", and of "/") but those need to be dealt with
# irrespective of which input RE type has been chosen.
local grepere ere s_esc
s_esc=off
grepere=$1
ere=""
for i in {1..$#grepere}; do
if [[ $s_esc == off && $grepere[$i] == '\\' ]]; then
s_esc=on
elif [[ $s_esc == on && $grepere[$i] == '`' ]]; then
ere+='^'
s_esc=off
elif [[ $s_esc == on && $grepere[$i] == "'" ]]; then
ere+='$'
s_esc=off
elif [[ $s_esc == on ]]; then
ere+="\\\\$grepere[$i]"
s_esc=off
else
ere+="$grepere[$i]"
fi
done
printf -- "%s" "$ere"
}
# Some debugging/testing convenience functions.
# If grep2awk is autoloaded, the regex replacement
# functions can be called directly:
#
# grep2awk bre2ere 'a[^a-z]\(c$\|d\)'
#
# See the test suite for some examples.
# NOTE(review): `return $?` reports the status of `echo`, not that of
# the helper -- left as is.
[[ -n $1 ]] && {
    echo $($1 "$2")
    return $?
}

# Holds the generated awk program until it is known to be usable.
local translated

# The awk command to insert; configurable via zstyle, defaults to `awk`.
zstyle -s ':grep2awk:' awk awk
: ${awk:=awk}

# Store line as $reply[@]:
split-shell-arguments

# `flag` keeps the program state while looping through the splitted
# shell words. Its semantics:
# flag==0: No 'grep', egrep', 'fgrep' encountered yet.
# flag==1: Set when grep command is replaced by awk
# command. Options or regex.
# flag==2: Regex (End of options arrived).
# flag==3: The rest.
flag=0
cursor=0
for (( i=1; i<=$#reply; i++ )); do
    if (( i%2==1 && flag < 3 )); then
        # Odd elements are the whitespace between words. They count
        # towards the cursor position only as long as the regex has not
        # been replaced by the awk program yet (flag < 3).
        (( cursor += $#reply[$i] ))
    elif (( flag == 0 )); then
        # Still looking for the grep command itself.
        case $reply[$i] in
            (grep)
                flag=1
                reply[$i]="$awk"
                ;;
            (egrep)
                flag=1
                reply[$i]="$awk"
                typeset "opt[E]"=1
                ;;
            (fgrep)
                flag=1
                reply[$i]="$awk"
                typeset "opt[F]"=1
                ;;
        esac
        (( cursor += $#reply[$i] ))
    elif [[ $flag == 1 && $reply[$i] == -* ]]; then
        # Is it the End of Options-marker?
        if [[ $reply[$i] == "--" ]]; then
            flag=2
        elif [[ $reply[$i] =~ ^--. ]]; then
            zle -M "grep2awk: Long options to grep are not supported"
            return 1
        else
            # Reset, so that a lone `-` cannot be judged by the option
            # letter of an earlier word.
            o=""
            for ((l=1; l<$#reply[$i]; l++)); do
                # Is it an option to grep? A regular expression of
                # the options that are recognized:
                o=${reply[$i]:$l:1}
                if [[ $o =~ "c|E|e|F|G|H|i|l|L|n|v|w|x|y" ]]; then
                    typeset "opt[$o]"=1
                else
                    zle -M "grep2awk: Option $o is not supported"
                    return 1
                fi
            done
            # A trailing -e announces that the next word is the regex.
            [[ $o == "e" ]] && flag=2
        fi
        # Delete the options, and the now superfluous whitespace:
        reply[$i]=""
        reply[$i+1]=""
        # And leave the cursor pointer where it was.
    elif (( flag==1 || flag==2 )); then
        # The fall-through for opt_or_reg OR regex:
        # This is the regex.
        # Stop when no awk program could be generated; BUFFER has not
        # been touched at this point, so the command line stays intact
        # instead of losing its pattern.
        translated=$(regex2awkprog "$reply[$i]") || {
            zle -M "grep2awk: could not translate the grep invocation; command line left unchanged."
            return 1
        }
        reply[$i]=$translated
        (( cursor += $#reply[$i] - 2 ))
        flag=3
    fi
    # Append the current version of $reply[$i] to replacement_line:
    replacement_line+=$reply[$i]
done

# Now we are done. Apply the changes to the BUFFER:
BUFFER="$replacement_line"
CURSOR=$cursor
}
grep2awk "$@"
# vim: ft=zsh