Improved tr
- Added support for character ranges ( a-z ) - Added support for complementary charset ( -c ), only in delete mode - Added support for octal escape sequences - Unicode now only works when there are no octal escape sequences, otherwise behavior is not predictable at first sight. - tr now supports null characters in the input - Does not yet have support for character classes ( [:upper:] )
This commit is contained in:
parent
8b3a9c1971
commit
b3a63a60e4
13
tr.1
13
tr.1
|
@ -3,7 +3,7 @@
|
||||||
tr \- translate characters
|
tr \- translate characters
|
||||||
.SH SYNOPSIS
|
.SH SYNOPSIS
|
||||||
.B tr
|
.B tr
|
||||||
.RB [ \-d ]
|
.RB [ \-d ] [ \-c ]
|
||||||
.RB set1
|
.RB set1
|
||||||
.P
|
.P
|
||||||
.B tr
|
.B tr
|
||||||
|
@ -13,6 +13,9 @@ tr \- translate characters
|
||||||
.TP
|
.TP
|
||||||
.B \-d
|
.B \-d
|
||||||
For compatibility. If given, characters in set1 will be deleted from the input and specifying set2 will result in an error.
|
For compatibility. If given, characters in set1 will be deleted from the input and specifying set2 will result in an error.
|
||||||
|
.B \-c
|
||||||
|
Complement. Causes the specified character set to be inverted, that is, all the characters not specified belong to it.
|
||||||
|
It only works in conjunction with \-d, because the order of the characters in a complemented set is not meaningful for translation.
|
||||||
.SH DESCRIPTION
|
.SH DESCRIPTION
|
||||||
.B tr
|
.B tr
|
||||||
reads input from stdin replacing every character in
|
reads input from stdin replacing every character in
|
||||||
|
@ -50,9 +53,15 @@ If set1 is longer than set2
|
||||||
.B tr
|
.B tr
|
||||||
will map all the remaining characters to the last one in set2. In case set2 is longer than set1, the remaining characters from set2 will be ignored.
|
will map all the remaining characters to the last one in set2. In case set2 is longer than set1, the remaining characters from set2 will be ignored.
|
||||||
.B
|
.B
|
||||||
|
Character escape sequences, be they characters or octal numbers, are written by preceding the token with a "\\". You may specify three digits or fewer for an octal number,
|
||||||
|
digits will stop being read when a non-octal character is encountered or when three digits have been read.
|
||||||
|
.B
|
||||||
|
Use "A-B" for ordered sets from A to B.
|
||||||
|
.B
|
||||||
.SH NOTES
|
.SH NOTES
|
||||||
.B tr
|
.B tr
|
||||||
is Unicode-aware but does not yet handle character classes (e.g. [:alnum:] or [:digit:]).
|
is Unicode-aware, but only if you don't specify characters in octal (for example \\012), because otherwise its behavior is not predictable. It does not yet support character
|
||||||
|
classes.
|
||||||
.SH SEE ALSO
|
.SH SEE ALSO
|
||||||
.IR sed(1)
|
.IR sed(1)
|
||||||
.IR awk(1)
|
.IR awk(1)
|
||||||
|
|
350
tr.c
350
tr.c
|
@ -3,7 +3,6 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <sys/mman.h>
|
|
||||||
#include <locale.h>
|
#include <locale.h>
|
||||||
#include <wchar.h>
|
#include <wchar.h>
|
||||||
#include "text.h"
|
#include "text.h"
|
||||||
|
@ -12,135 +11,316 @@
|
||||||
static void
|
static void
|
||||||
usage(void)
|
usage(void)
|
||||||
{
|
{
|
||||||
eprintf("usage: %s [-d] set1 [set2]\n", argv0);
|
eprintf("usage: %s [-d] [-c] set1 [set2]\n", argv0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int dflag, cflag;
|
||||||
|
static wchar_t mappings[0x110000];
|
||||||
|
|
||||||
|
struct wset_state {
|
||||||
|
char *s; /* current character */
|
||||||
|
wchar_t rfirst, rlast; /* first and last in range */
|
||||||
|
wchar_t prev; /* previous returned character */
|
||||||
|
int prev_was_range; /* was the previous character part of a c-c range? */
|
||||||
|
};
|
||||||
|
|
||||||
|
struct set_state {
|
||||||
|
char *s, rfirst, rlast, prev;
|
||||||
|
int prev_was_octal; /* was the previous returned character written in octal? */
|
||||||
|
};
|
||||||
|
|
||||||
|
static void
|
||||||
|
set_state_defaults(struct set_state *s)
|
||||||
|
{
|
||||||
|
s->rfirst = 1;
|
||||||
|
s->rlast = 0;
|
||||||
|
s->prev_was_octal = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
handleescapes(char *s)
|
wset_state_defaults(struct wset_state *s)
|
||||||
{
|
{
|
||||||
|
s->rfirst = 1;
|
||||||
|
s->rlast = 0;
|
||||||
|
s->prev_was_range = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* sets *s to the char that was intended to be written.
|
||||||
|
* returns how many bytes the s pointer has to advance to skip the
|
||||||
|
* escape sequence if it was an octal, always zero otherwise. */
|
||||||
|
static int
|
||||||
|
resolve_escape(char *s)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
unsigned char c;
|
||||||
|
|
||||||
switch(*s) {
|
switch(*s) {
|
||||||
case 'n':
|
case 'n':
|
||||||
*s = '\n';
|
*s = '\n';
|
||||||
break;
|
return 0;
|
||||||
case 't':
|
case 't':
|
||||||
*s = '\t';
|
*s = '\t';
|
||||||
break;
|
return 0;
|
||||||
case '\\':
|
|
||||||
*s = '\\';
|
|
||||||
break;
|
|
||||||
case 'r':
|
case 'r':
|
||||||
*s = '\r';
|
*s = '\r';
|
||||||
break;
|
return 0;
|
||||||
case 'f':
|
case 'f':
|
||||||
*s = '\f';
|
*s = '\f';
|
||||||
break;
|
return 0;
|
||||||
case 'a':
|
case 'a':
|
||||||
*s = '\a';
|
*s = '\a';
|
||||||
break;
|
return 0;
|
||||||
case 'b':
|
case 'b':
|
||||||
*s = '\b';
|
*s = '\b';
|
||||||
break;
|
return 0;
|
||||||
case 'v':
|
case 'v':
|
||||||
*s = '\v';
|
*s = '\v';
|
||||||
break;
|
return 0;
|
||||||
|
case '\\':
|
||||||
|
*s = '\\';
|
||||||
|
return 0;
|
||||||
|
case '\0':
|
||||||
|
eprintf("stray '\\' at end of input:");
|
||||||
|
default: ;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(*s<'0' || *s>'7')
|
||||||
|
eprintf("invalid character after '\\':");
|
||||||
|
for(i=0, c=0; s[i]>='0' && s[i]<='7' && i<3; i++) {
|
||||||
|
c <<= 3;
|
||||||
|
c += s[i]-'0';
|
||||||
|
}
|
||||||
|
if(*s>'3' && i==3)
|
||||||
|
eprintf("octal byte cannot be bigger than 377:");
|
||||||
|
*s = c;
|
||||||
|
return i;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define embtowc(a, b) mbtowc(a, b, 4)
|
||||||
|
|
||||||
static int
|
static int
|
||||||
xmbtowc(wchar_t *unicodep, const char *s)
|
xmbtowc(wchar_t *unicodep, const char *s)
|
||||||
{
|
{
|
||||||
int rv;
|
int rv;
|
||||||
|
|
||||||
rv = mbtowc(unicodep, s, 4);
|
rv = embtowc(unicodep, s);
|
||||||
if (rv < 0)
|
if (rv < 0)
|
||||||
eprintf("mbtowc:");
|
eprintf("mbtowc: invalid input sequence:");
|
||||||
return rv;
|
return rv;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static int
|
||||||
parsemapping(const char *set1, const char *set2, wchar_t *mappings)
|
has_octal_escapes(const char *s)
|
||||||
{
|
{
|
||||||
char *s1, *s2;
|
while(*s)
|
||||||
wchar_t runeleft;
|
if(*s++ == '\\' && *s >= '0' && *s <= '7')
|
||||||
wchar_t runeright;
|
return 1;
|
||||||
int leftbytes;
|
return 0;
|
||||||
int rightbytes;
|
}
|
||||||
|
|
||||||
s1 = (char *)set1;
|
static char
|
||||||
if(set2)
|
get_next_char(struct set_state *s)
|
||||||
s2 = (char *)set2;
|
{
|
||||||
else
|
char c;
|
||||||
s2 = (char *)set1;
|
int nchars;
|
||||||
|
|
||||||
while(*s1) {
|
start:
|
||||||
if(*s1 == '\\')
|
if(s->rfirst <= s->rlast) {
|
||||||
handleescapes(++s1);
|
c = s->rfirst;
|
||||||
leftbytes = xmbtowc(&runeleft, s1);
|
s->rfirst++;
|
||||||
s1 += leftbytes;
|
return c;
|
||||||
if(*s2 == '\\')
|
}
|
||||||
handleescapes(++s2);
|
|
||||||
if(*s2 != '\0') {
|
if(*s->s == '-' && !s->prev_was_octal) {
|
||||||
rightbytes = xmbtowc(&runeright, s2);
|
s->s++;
|
||||||
s2 += rightbytes;
|
if(!*s->s)
|
||||||
|
return '-';
|
||||||
|
if(*s->s == '\\' && (nchars = resolve_escape(++(s->s))))
|
||||||
|
goto char_is_octal;
|
||||||
|
s->rlast = *(s->s)++;
|
||||||
|
if(!s->rlast)
|
||||||
|
return '\0';
|
||||||
|
s->prev_was_octal = 1;
|
||||||
|
s->rfirst = ++(s->prev);
|
||||||
|
goto start;
|
||||||
|
}
|
||||||
|
if(*s->s == '\\' && (nchars = resolve_escape(++(s->s))))
|
||||||
|
goto char_is_octal;
|
||||||
|
|
||||||
|
s->prev_was_octal = 0;
|
||||||
|
c = *(s->s)++;
|
||||||
|
s->prev = c;
|
||||||
|
return c;
|
||||||
|
|
||||||
|
char_is_octal:
|
||||||
|
s->prev_was_octal = 1;
|
||||||
|
c = *s->s;
|
||||||
|
s->s += nchars;
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
static wchar_t
|
||||||
|
get_next_wchar(struct wset_state *s)
|
||||||
|
{
|
||||||
|
start:
|
||||||
|
if(s->rfirst <= s->rlast) {
|
||||||
|
s->prev = s->rfirst;
|
||||||
|
s->rfirst++;
|
||||||
|
return s->prev;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(*s->s == '-' && !s->prev_was_range) {
|
||||||
|
s->s++;
|
||||||
|
if(!*s->s)
|
||||||
|
return '-';
|
||||||
|
if(*s->s == '\\')
|
||||||
|
resolve_escape(++(s->s));
|
||||||
|
s->s += xmbtowc(&s->rlast, s->s);
|
||||||
|
if(!s->rlast)
|
||||||
|
return '\0';
|
||||||
|
s->rfirst = ++(s->prev);
|
||||||
|
s->prev_was_range = 1;
|
||||||
|
goto start;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(*s->s == '\\')
|
||||||
|
resolve_escape(++(s->s));
|
||||||
|
s->s += xmbtowc(&s->prev, s->s);
|
||||||
|
s->prev_was_range = 0;
|
||||||
|
return s->prev;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
is_mapping_wide(const char *set1, const char *set2)
|
||||||
|
{
|
||||||
|
struct set_state ss1, ss2;
|
||||||
|
struct wset_state wss1, wss2;
|
||||||
|
wchar_t wc1, wc2, last_wc2;
|
||||||
|
|
||||||
|
if(has_octal_escapes(set1)) {
|
||||||
|
set_state_defaults(&ss1);
|
||||||
|
ss1.s = (char *) set1;
|
||||||
|
if(set2) {
|
||||||
|
set_state_defaults(&ss2);
|
||||||
|
ss2.s = (char *) set2;
|
||||||
|
/* if the character returned is from an octal triplet, it might be null
|
||||||
|
and still need to continue */
|
||||||
|
while((wc1 = (unsigned char) get_next_char(&ss1)) || ss1.prev_was_octal ) {
|
||||||
|
if(!(wc2 = (unsigned char) get_next_char(&ss2)))
|
||||||
|
wc2 = last_wc2;
|
||||||
|
mappings[wc1] = wc2;
|
||||||
|
last_wc2 = wc2;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
while((wc1 = (unsigned char) get_next_char(&ss1)) || ss1.prev_was_octal)
|
||||||
|
mappings[wc1] = 1;
|
||||||
}
|
}
|
||||||
mappings[runeleft] = runeright;
|
return 0;
|
||||||
|
} else {
|
||||||
|
wset_state_defaults(&wss1);
|
||||||
|
wss1.s = (char *) set1;
|
||||||
|
if(set2) {
|
||||||
|
wset_state_defaults(&wss2);
|
||||||
|
wss2.s = (char *) set2;
|
||||||
|
while((wc1 = get_next_wchar(&wss1))) {
|
||||||
|
if(!(wc2 = get_next_wchar(&wss2)))
|
||||||
|
wc2 = last_wc2;
|
||||||
|
mappings[wc1] = wc2;
|
||||||
|
last_wc2 = wc2;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
while((wc1 = get_next_wchar(&wss1)))
|
||||||
|
mappings[wc1] = 1;
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
return 0; /* unreachable */
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
wmap_null(char *in, ssize_t nbytes)
|
||||||
|
{
|
||||||
|
char *s;
|
||||||
|
wchar_t rune;
|
||||||
|
int parsed_bytes = 0;
|
||||||
|
|
||||||
|
s = in;
|
||||||
|
while(nbytes) {
|
||||||
|
parsed_bytes = embtowc(&rune, s);
|
||||||
|
if(parsed_bytes < 0) {
|
||||||
|
rune = *s;
|
||||||
|
parsed_bytes = 1;
|
||||||
|
}
|
||||||
|
if(((!mappings[rune])&1) ^ cflag)
|
||||||
|
putwchar(rune);
|
||||||
|
s += parsed_bytes;
|
||||||
|
nbytes -= parsed_bytes;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
maptonull(const wchar_t *mappings, char *in)
|
wmap_set(char *in, ssize_t nbytes)
|
||||||
{
|
{
|
||||||
const char *s;
|
char *s;
|
||||||
wchar_t runeleft;
|
wchar_t rune;
|
||||||
int leftbytes = 0;
|
int parsed_bytes = 0;
|
||||||
|
|
||||||
s = in;
|
s = in;
|
||||||
while(*s) {
|
while(nbytes) {
|
||||||
leftbytes = xmbtowc(&runeleft, s);
|
parsed_bytes = embtowc(&rune, s);
|
||||||
if(!mappings[runeleft])
|
if(parsed_bytes < 0) {
|
||||||
putwchar(runeleft);
|
rune = *s;
|
||||||
s += leftbytes;
|
parsed_bytes = 1;
|
||||||
}
|
}
|
||||||
}
|
if(!mappings[rune])
|
||||||
|
putwchar(rune);
|
||||||
static void
|
|
||||||
maptoset(const wchar_t *mappings, char *in)
|
|
||||||
{
|
|
||||||
const char *s;
|
|
||||||
wchar_t runeleft;
|
|
||||||
int leftbytes = 0;
|
|
||||||
|
|
||||||
s = in;
|
|
||||||
while(*s) {
|
|
||||||
leftbytes = xmbtowc(&runeleft, s);
|
|
||||||
if(!mappings[runeleft])
|
|
||||||
putwchar(runeleft);
|
|
||||||
else
|
else
|
||||||
putwchar(mappings[runeleft]);
|
putwchar(mappings[rune]);
|
||||||
s += leftbytes;
|
nbytes -= parsed_bytes;
|
||||||
|
s += parsed_bytes;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
map_null(char *in, ssize_t nbytes)
|
||||||
|
{
|
||||||
|
char *s;
|
||||||
|
|
||||||
|
for(s=in; nbytes; s++, nbytes--)
|
||||||
|
if(((!mappings[(unsigned char)*s])&1) ^ cflag)
|
||||||
|
putchar(*s);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
map_set(char *in, ssize_t nbytes)
|
||||||
|
{
|
||||||
|
char *s;
|
||||||
|
|
||||||
|
for(s=in; nbytes; s++, nbytes--)
|
||||||
|
if(!mappings[(unsigned char)*s])
|
||||||
|
putchar(*s);
|
||||||
|
else
|
||||||
|
putchar(mappings[(unsigned char)*s]);
|
||||||
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
main(int argc, char *argv[])
|
main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
wchar_t *mappings;
|
|
||||||
char *buf = NULL;
|
char *buf = NULL;
|
||||||
size_t size = 0;
|
size_t size = 0;
|
||||||
void (*mapfunc)(const wchar_t*, char*);
|
ssize_t nbytes;
|
||||||
int dflag = 0;
|
void (*mapfunc)(char*, ssize_t);
|
||||||
|
|
||||||
setlocale(LC_ALL, "");
|
setlocale(LC_ALL, "");
|
||||||
|
dflag = cflag = 0;
|
||||||
mappings = mmap(NULL, 0x110000 * sizeof(wchar_t),
|
|
||||||
PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
|
|
||||||
if (mappings == MAP_FAILED)
|
|
||||||
eprintf("mmap:");
|
|
||||||
|
|
||||||
ARGBEGIN {
|
ARGBEGIN {
|
||||||
case 'd':
|
case 'd':
|
||||||
dflag = 1;
|
dflag = 1;
|
||||||
break;
|
break;
|
||||||
|
case 'c':
|
||||||
|
cflag = 1;
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
usage();
|
usage();
|
||||||
} ARGEND;
|
} ARGEND;
|
||||||
|
@ -148,25 +328,29 @@ main(int argc, char *argv[])
|
||||||
if(argc == 0)
|
if(argc == 0)
|
||||||
usage();
|
usage();
|
||||||
|
|
||||||
if(dflag || argc == 1) {
|
if(dflag) {
|
||||||
if(argc != 1)
|
if(argc != 1)
|
||||||
usage();
|
usage();
|
||||||
parsemapping(argv[0], NULL, mappings);
|
if(is_mapping_wide(argv[0], NULL))
|
||||||
mapfunc = maptonull;
|
mapfunc = wmap_null;
|
||||||
|
else
|
||||||
|
mapfunc = map_null;
|
||||||
|
} else if(cflag) {
|
||||||
|
usage();
|
||||||
|
} else if(argc == 2) {
|
||||||
|
if(is_mapping_wide(argv[0], argv[1]))
|
||||||
|
mapfunc = wmap_set;
|
||||||
|
else
|
||||||
|
mapfunc = map_set;
|
||||||
} else {
|
} else {
|
||||||
if(argc != 2)
|
usage();
|
||||||
usage();
|
|
||||||
parsemapping(argv[0], argv[1], mappings);
|
|
||||||
mapfunc = maptoset;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
while(agetline(&buf, &size, stdin) != -1)
|
while((nbytes = agetline(&buf, &size, stdin)) != -1)
|
||||||
mapfunc(mappings, buf);
|
mapfunc(buf, nbytes);
|
||||||
free(buf);
|
free(buf);
|
||||||
if(ferror(stdin))
|
if(ferror(stdin))
|
||||||
eprintf("<stdin>: read error:");
|
eprintf("<stdin>: read error:");
|
||||||
|
|
||||||
munmap(mappings, 0x110000 * sizeof(wchar_t));
|
|
||||||
|
|
||||||
return EXIT_SUCCESS;
|
return EXIT_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user