-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtsort.c
297 lines (275 loc) · 9.99 KB
/
tsort.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
// tsort.c
// Copyright 2024 Ray Gardner
// License: 0BSD
// vi: tabstop=2 softtabstop=2 shiftwidth=2
//
// See https://pubs.opengroup.org/onlinepubs/9699919799/utilities/tsort.html
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
// A topological sort reads a set of relations, i.e. pairs of
// (predecessor, successor) symbols, and writes a list of unique symbols
// such that each predecessor appears before any of its successors.
// This is Knuth's Algorithm T, Topological sort,
// from The Art of Computer Programming Vol 1, sec. 2.2.3.
//
// The algorithm builds a table of symbols such that each entry has a count of
// its predecessors and a linked list of its successors. When all symbols have
// been read, the symbols with a predecessor count of zero are put into a
// queue. The symbol at the front of the queue is removed and output, and then
// the predecessor count of each of its successors is reduced by one. If any
// of those counts are reduced to zero, that successor's symbol is placed at
// the rear of the queue, and the process is repeated for the next symbol at
// the front of the queue. When the queue is empty, the algorithm checks if
// all the symbols have been output. If not, then there is a loop (cycle) in
// the set of relations. The algorithm walks 'backwards' through predecessors
// to print a cycle.
// The original algo expects relations as pairs drawn from consecutive
// integers starting at 1. This version assumes arbitrary symbols, so uses a
// hash table to relate the symbols to integers.
// Symbol table, indexed by integers in the hash table below.
// symoffs: offset of the symbol string in the input data
// pq: (Knuth's COUNT aka QLINK) count of predecessors of a symbol
// and used later as a link in the queue of symbols with no
// remaining predecessors
// succx: (Knuth's TOP) index into linked list of successors in the successors
// table, or 0 if no successors
struct symbols {int symoffs, pq, succx;};
// Successors table, indexed by symbols.succx field.
// symbx: (Knuths's SUC) index into symbols table
// next: (Knuth's NEXT) index to next successor, or 0
struct successors {int symbx, next;};
// Hash table to associate symbols with integer indexes to tables 'symbols'
// and 'successors'. The probe sequence is borrowed from Python dict, using
// the 'perturb' idea to mix in upper bits.
// See https://github.com/python/cpython/blob/main/Objects/dictobject.c
//
// dat is input data area; dat + offs points to symbol to look up.
// hashtbl is the actual hash table. Entries are 0 (unused) or the index into
// symbols table (stbl).
// hashmask is 2**n-1 where n is table size
// symcnt is count of symbols inserted
static int hash_lookup(char *dat, int offs,
int *hashtbl, int hashmask, int *symcnt, struct symbols *stbl)
{
int n, probe;
char *s = dat + offs;
unsigned perturb = 5381; // djb2a hash
while (*s) perturb = perturb * 33 ^ *(unsigned char *)s++;
probe = perturb & hashmask;
while ((n = hashtbl[probe])) {
if (! strcmp(dat + offs, dat + stbl[n].symoffs)) return n;
probe = (probe * 5 + 1 + (perturb >>= 5)) & hashmask;
}
hashtbl[probe] = ++*symcnt;
stbl[*symcnt].symoffs = offs;
return *symcnt;
}
void error_exit(char *s) {fprintf(stderr, "%s\n", s); exit(123);}
void do_tsort(char *dat, int plen)
{
struct symbols *stbl;
struct successors *succlist;
int i, j, k;
int nsym, nsym2;
int *hashtbl, hashmask, symcnt = 0;
char *pred = 0, *succ; // pointers to pair symbols (predecessor, successor)
int npred, nsucc; // indexes of pairs in stbl
int succ_cnt = 0; // number of successors added to succlist structure
char *whitespace = " \r\n\t";
int nleft, front, rear = 0;
// Make sure input has no NUL bytes
for (i = 0; i < plen; i++)
if (!dat[i]) dat[i] = ' ';
// T1. Initialize. Get number of pairs, allocate tables.
// Count input symbols (non-unique)
for (i = nsym = 0; i < plen; nsym++) {
i += strspn(dat + i, whitespace);
if (i == plen) break;
i += strcspn(dat + i, whitespace);
}
// nsym == number_of_pairs * 2
if (nsym & 1) error_exit("bad input (not pairs)");
// Worst case is every symbol is unique (a b, c d, ...)
// so need a hash table for at least nsym entries.
// Hash table must be large enough to load to < 100%
// Size must be power of 2; use next above (102% of nsym plus 5).
for (k = 2; k < nsym + nsym / 50 + 5; k *= 2)
;
hashmask = k - 1;
hashtbl = calloc(k, sizeof *hashtbl);
if (!hashtbl) error_exit("Cannot allocate hash table.");
// Need a symbols structure to hold (nsym + 1) symbols (stbl)
// plus successors structure (nsym / 2 + 1) successors (succlist).
// The +1 is to accommodate 1-based indexing.
stbl = calloc((nsym+1) * sizeof(*stbl) + (nsym/2+1) * sizeof(*succlist), 1);
if (!stbl) error_exit("Cannot allocate tables.");
succlist = (void *)&stbl[nsym+1];
// re-scan the symbols, enter in hashtbl and stbl.
for (i = nsym2 = 0; i < plen; nsym2++) {
// T2. Next relation. Get a relation from input.
i += strspn(dat + i, whitespace);
if (i == plen) break;
j = strcspn(dat + i, whitespace);
dat[i+j] = 0; // NUL terminate symbol. readfd() ensures safe at buffer end
if (! (nsym2 & 1)) {
pred = dat + i;
} else {
succ = dat + i;
npred = hash_lookup(dat, pred-dat, hashtbl, hashmask, &symcnt, stbl);
nsucc = hash_lookup(dat, i, hashtbl, hashmask, &symcnt, stbl);
// T3. Record the relation.
// Slight mod: if self-reference relation, do not record it.
// (If it's self-reference, the symbol is already in the table now.)
// Note that this links the most recent into the head of the successor
// list, reversing their order in the input. If the symbols table kept a
// tail pointer, they could be kept in the list in original order.
if (strcmp(pred, succ)) {
stbl[nsucc].pq++;
succlist[++succ_cnt].symbx = nsucc;
succlist[succ_cnt].next = stbl[npred].succx;
stbl[npred].succx = succ_cnt;
}
}
i += j + 1;
}
free(hashtbl); // Done using hash table.
// T4. Scan for zeros. Build a queue of symbols w/ no predecessor.
// Note Knuth's QLINK is our stbl[].pq.
stbl[0].pq = 0;
for (k = 1; k <= symcnt; k++) {
if (! stbl[k].pq) {
stbl[rear].pq = k;
rear = k;
}
}
front = stbl[0].pq;
// T5. Output front of queue.
nleft = symcnt; // This will count down as symbols are output.
while (front) {
printf("%s\n", dat + stbl[front].symoffs);
nleft--;
j = stbl[front].succx;
stbl[front].succx = 0; // Per Knuth 2.2.3 Exercise 23
// T6. Erase relations. Step through successors of symbol we just output,
// decreasing the predecessor count of each one. If that count went to
// zero, put that symbol on the queue.
while (j) {
stbl[succlist[j].symbx].pq--;
if (stbl[succlist[j].symbx].pq == 0) {
stbl[rear].pq = succlist[j].symbx;
rear = succlist[j].symbx;
}
j = succlist[j].next;
}
// T7. Remove the symbol from front of queue.
front = stbl[front].pq;
}
// T8. End, if OK. If any symbols left, there is a loop (cycle)
// in the associated graph. Find and print a loop.
if (nleft) {
int k0, k, kn, knn;
printf("input contains a loop:\n");
// Per TAOCP Knuth 2.2.3 Exercise 23, find and print a loop.
for (k = 1; k <= symcnt; k++) stbl[k].pq = 0;
// T9. (reuses succx and pq fields) For each symbol, set pq field to
// refer to a predecessor.
for (k = 1; k <= symcnt; k++) {
j = stbl[k].succx;
stbl[k].succx = 0;
// T10. Step thru successors, setting corresponding pq field.
while (j) {
if (!stbl[succlist[j].symbx].pq)
stbl[succlist[j].symbx].pq = k;
j = succlist[j].next;
}
}
// T11. Find a symbol with its pq field set.
for (k = 1; k <= symcnt; k++)
if (stbl[k].pq) break;
//if (k > symcnt) error_exit("bug"); // Not needed if algo is correct...
// T12. Step thru graph backwards, flagging symbols until start of loop.
do {
stbl[k].succx = 1;
k = stbl[k].pq;
} while (stbl[k].succx == 0);
// Per note after T14, the loop links backwards. Reverse the loop.
// Not sure if this is the cleanest way... (rdg)
k0 = k;
kn = stbl[k].pq;
do {
knn = stbl[kn].pq;
stbl[kn].pq = k;
k = kn;
kn = knn;
} while (k != k0);
// T13. Step through loop, printing symbols.
do {
printf("%s ->\n", dat + stbl[k].symoffs);
stbl[k].succx = 0;
k = stbl[k].pq;
} while (stbl[k].succx == 1);
// T14. Print first element of loop again to tie it off.
printf("%s\n", dat + stbl[k].symoffs);
}
free(stbl);
}
char *read_input(FILE *fp, int *plen)
{
int bufsize = 4096;
long len = 0;
char *d;
if (fp != stdin && !fseek(fp, 0, SEEK_END)) {
// Seekable file. (?)
if ((len = ftell(fp)) < 0 || len >= INT_MAX) return 0;
rewind(fp);
if (!(d = malloc(len + 1))) return 0;
if ((long)fread(d, 1, len, fp) != len)
{
free(d);
return 0;
}
d[*plen = len] = 0;
return d;
}
// Non seekable; e.g. stdin from terminal or pipe
// Need to read into expandable buffer until EOF
if (!(d = malloc(bufsize+1))) return 0;
int k = 0;
int offset = 0;
while (!feof(fp)) {
k = fread(d + offset, 1, bufsize - offset, fp);
len += k;
if (k < bufsize - offset && feof(fp)) break;
offset += k;
if (offset == bufsize) {
bufsize = bufsize * 3 / 2;
char *dd = realloc(d, bufsize + 1);
if (!dd) {
free(d);
return 0;
}
d = dd;
}
}
d[*plen = len] = 0;
return d;
}
int main(int argc, char **argv)
{
char *dat;
int plen;
FILE *fp = stdin;
if (argc > 2) error_exit("Usage: tsort [infile]\ndefault is stdin");
if (argc == 2) {
if (strcmp(argv[1], "-"))
fp = fopen(argv[1], "rb");
}
if (!fp) error_exit("Can't open file");
dat = read_input(fp, &plen);
if (!dat) error_exit("cannot read input");
do_tsort(dat, plen);
return 0;
}