Revamp tr(1) set parsing and handling
GNU coreutils' tr does not support mappings such as $ echo "1234abc" | tr "[:alnum:]" "[:upper:]" or $ echo "ABCabc" | tr -c "[:upper:]" "[l*]", to give just a few examples. This commit broadens the scope of tr(1) as far as humanly possible so that it can map between classes and non-classes, making tr a usable tool that actually fulfils user expectations. POSIX is of no real help here, as it still more or less assumes a fixed ASCII table rather than complex Unicode code points or even grapheme clusters.
This commit is contained in:
parent
9de401a495
commit
bc4c293fe5
78
tr.c
78
tr.c
|
@ -119,7 +119,7 @@ nextbrack:
|
||||||
}
|
}
|
||||||
|
|
||||||
/* REPEAT [_*n] (only allowed in set2) */
|
/* REPEAT [_*n] (only allowed in set2) */
|
||||||
if (j - i > 2 && rstr[i + 2] == '*' && set1ranges > 0) {
|
if (j - i > 2 && rstr[i + 2] == '*') {
|
||||||
/* check if right side of '*' is a number */
|
/* check if right side of '*' is a number */
|
||||||
q = 0;
|
q = 0;
|
||||||
factor = 1;
|
factor = 1;
|
||||||
|
@ -138,7 +138,7 @@ nextbrack:
|
||||||
}
|
}
|
||||||
(*set)[setranges].start = rstr[i + 1];
|
(*set)[setranges].start = rstr[i + 1];
|
||||||
(*set)[setranges].end = rstr[i + 1];
|
(*set)[setranges].end = rstr[i + 1];
|
||||||
(*set)[setranges].quant = q ? q : setlen(set1, set1ranges);
|
(*set)[setranges].quant = q ? q : setlen(set1, MAX(set1ranges, 1));
|
||||||
setranges++;
|
setranges++;
|
||||||
i = j;
|
i = j;
|
||||||
continue;
|
continue;
|
||||||
|
@ -196,38 +196,45 @@ main(int argc, char *argv[])
|
||||||
set1ranges = makeset(argv[0], &set1, &set1check);
|
set1ranges = makeset(argv[0], &set1, &set1check);
|
||||||
if (argc == 2)
|
if (argc == 2)
|
||||||
set2ranges = makeset(argv[1], &set2, &set2check);
|
set2ranges = makeset(argv[1], &set2, &set2check);
|
||||||
if (dflag == sflag && !set2ranges && !set2check)
|
|
||||||
eprintf("set2 must be non-empty.\n");
|
if (!dflag) {
|
||||||
if (argc == 2 && !set2check != !set1check)
|
/* sanity checks as we are translating */
|
||||||
eprintf("can't mix classes with non-classes.\n");
|
if (!set2ranges && !set2check)
|
||||||
if (set2check && set2check != islowerrune && set2check != isupperrune)
|
eprintf("cannot map to an empty set.\n");
|
||||||
eprintf("set2 can only be the 'lower' or 'upper' class.\n");
|
if (set2check && set2check != islowerrune &&
|
||||||
if (set2check && cflag && !dflag)
|
set2check != isupperrune) {
|
||||||
eprintf("set2 can't be imaged to from a complement.\n");
|
eprintf("can only map to 'lower' and 'upper' class.\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
read:
|
read:
|
||||||
if (!efgetrune(&r, stdin, "<stdin>")) {
|
if (!efgetrune(&r, stdin, "<stdin>")) {
|
||||||
ret |= fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>");
|
ret |= fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>");
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
off1 = off2 = 0;
|
for (i = 0, off1 = 0; i < set1ranges; i++, off1 += rangelen(set1[i])) {
|
||||||
for (i = 0; i < set1ranges; i++) {
|
|
||||||
if (set1[i].start <= r && r <= set1[i].end) {
|
if (set1[i].start <= r && r <= set1[i].end) {
|
||||||
if (dflag) {
|
if (dflag) {
|
||||||
if (!cflag)
|
if (cflag)
|
||||||
goto read;
|
continue;
|
||||||
else
|
else
|
||||||
goto write;
|
goto read;
|
||||||
}
|
}
|
||||||
if (cflag)
|
if (cflag)
|
||||||
goto write;
|
goto write;
|
||||||
for (m = 0; m < i; m++)
|
|
||||||
off1 += rangelen(set1[m]);
|
/* map r to set2 */
|
||||||
off1 += r - set1[m].start;
|
if (set2check) {
|
||||||
|
if (set2check == islowerrune)
|
||||||
|
r = tolowerrune(r);
|
||||||
|
else
|
||||||
|
r = toupperrune(r);
|
||||||
|
} else {
|
||||||
|
off1 += r - set1[i].start;
|
||||||
if (off1 > setlen(set2, set2ranges) - 1) {
|
if (off1 > setlen(set2, set2ranges) - 1) {
|
||||||
r = set2[set2ranges - 1].end;
|
r = set2[set2ranges - 1].end;
|
||||||
goto write;
|
goto write;
|
||||||
}
|
}
|
||||||
for (m = 0; m < set2ranges; m++) {
|
for (m = 0, off2 = 0; m < set2ranges; m++) {
|
||||||
if (off2 + rangelen(set2[m]) > off1) {
|
if (off2 + rangelen(set2[m]) > off1) {
|
||||||
m++;
|
m++;
|
||||||
break;
|
break;
|
||||||
|
@ -236,30 +243,33 @@ read:
|
||||||
}
|
}
|
||||||
m--;
|
m--;
|
||||||
r = set2[m].start + (off1 - off2) / set2[m].quant;
|
r = set2[m].start + (off1 - off2) / set2[m].quant;
|
||||||
|
}
|
||||||
if (sflag && (r == lastrune))
|
|
||||||
goto read;
|
|
||||||
goto write;
|
goto write;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (set1check && set1check(r)) {
|
if (set1check && set1check(r)) {
|
||||||
if (dflag) {
|
if (dflag && !cflag)
|
||||||
if (!cflag)
|
|
||||||
goto read;
|
goto read;
|
||||||
else
|
if (set2check) {
|
||||||
goto write;
|
if (set2check == islowerrune)
|
||||||
}
|
|
||||||
if (set1check == isupperrune && set2check == islowerrune)
|
|
||||||
r = tolowerrune(r);
|
r = tolowerrune(r);
|
||||||
else if (set1check == islowerrune && set2check == isupperrune)
|
|
||||||
r = toupperrune(r);
|
|
||||||
else if (set2ranges > 0)
|
|
||||||
r = cflag ? r : set2[set2ranges - 1].end;
|
|
||||||
else
|
else
|
||||||
eprintf("Misaligned character classes.\n");
|
r = toupperrune(r);
|
||||||
} else if (cflag && set2ranges > 0) {
|
} else {
|
||||||
r = set2[set2ranges - 1].end;
|
r = set2[set2ranges - 1].end;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
if (!dflag && cflag) {
|
||||||
|
if (set2check) {
|
||||||
|
if (set2check == islowerrune)
|
||||||
|
r = tolowerrune(r);
|
||||||
|
else
|
||||||
|
r = toupperrune(r);
|
||||||
|
} else {
|
||||||
|
r = set2[set2ranges - 1].end;
|
||||||
|
}
|
||||||
|
goto write;
|
||||||
|
}
|
||||||
if (dflag && cflag)
|
if (dflag && cflag)
|
||||||
goto read;
|
goto read;
|
||||||
write:
|
write:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user