diff --git a/tr.1 b/tr.1 index 81141d9..5c299c0 100644 --- a/tr.1 +++ b/tr.1 @@ -3,7 +3,7 @@ tr \- translate characters .SH SYNOPSIS .B tr -.RB [ \-d ] +.RB [ \-d ] [ \-c ] .RB set1 .P .B tr @@ -13,6 +13,9 @@ tr \- translate characters .TP .B \-d For compatibility. If given, characters in set1 will be deleted from the input and specifying set2 will result in an error. +.B \-c +Complementary, causes the specified character set to be inverted, this is all the characters not specified belong to it. +It only works in conjunction with \-d, because order doesn't make much sense with translation. .SH DESCRIPTION .B tr reads input from stdin replacing every character in @@ -50,9 +53,15 @@ If set1 is longer than set2 .B tr will map all the remaining characters to the last one in set2. In case set2 is longer than set1, the remaining characters from set2 will be ignored. .B +Character escape sequences, be them characters or octal numbers, are done preceding the token with a "\\". You may specify three digits or less for it, +digits will stop being read when a non-octal character or when three characters are read. +.B +Use "A-B" for ordered sets fom A to B. +.B .SH NOTES .B tr -is Unicode-aware but does not yet handle character classes (e.g. [:alnum:] or [:digit:]). +is Unicode-aware, but only if you don't specify characters in octal (for example \\012), because else it is not predictable. Does not support character +classes. .SH SEE ALSO .IR sed(1) .IR awk(1) diff --git a/tr.c b/tr.c index 388d42f..f902672 100644 --- a/tr.c +++ b/tr.c @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include "text.h" @@ -12,135 +11,316 @@ static void usage(void) { - eprintf("usage: %s [-d] set1 [set2]\n", argv0); + eprintf("usage: %s [-d] [-c] set1 [set2]\n", argv0); +} + +static int dflag, cflag; +static wchar_t mappings[0x110000]; + +struct wset_state { + char *s; /* current character */ + wchar_t rfirst, rlast; /* first and last in range */ + wchar_t prev; /* previous returned character */ + int prev_was_range; /* was the previous character part of a c-c range? */ +}; + +struct set_state { + char *s, rfirst, rlast, prev; + int prev_was_octal; /* was the previous returned character written in octal? */ +}; + +static void +set_state_defaults(struct set_state *s) +{ + s->rfirst = 1; + s->rlast = 0; + s->prev_was_octal = 1; } static void -handleescapes(char *s) +wset_state_defaults(struct wset_state *s) { + s->rfirst = 1; + s->rlast = 0; + s->prev_was_range = 1; +} + +/* sets *s to the char that was intended to be written. + * returns how many bytes the s pointer has to advance to skip the + * escape sequence if it was an octal, always zero otherwise. */ +static int +resolve_escape(char *s) +{ + int i; + unsigned char c; + switch(*s) { case 'n': *s = '\n'; - break; + return 0; case 't': *s = '\t'; - break; - case '\\': - *s = '\\'; - break; + return 0; case 'r': *s = '\r'; - break; + return 0; case 'f': *s = '\f'; - break; + return 0; case 'a': *s = '\a'; - break; + return 0; case 'b': *s = '\b'; - break; + return 0; case 'v': *s = '\v'; - break; + return 0; + case '\\': + *s = '\\'; + return 0; + case '\0': + eprintf("stray '\\' at end of input:"); + default: ; } + + if(*s<'0' || *s>'7') + eprintf("invalid character after '\\':"); + for(i=0, c=0; s[i]>='0' && s[i]<='7' && i<3; i++) { + c <<= 3; + c += s[i]-'0'; + } + if(*s>'3' && i==3) + eprintf("octal byte cannot be bigger than 377:"); + *s = c; + return i; } +#define embtowc(a, b) mbtowc(a, b, 4) + static int xmbtowc(wchar_t *unicodep, const char *s) { int rv; - rv = mbtowc(unicodep, s, 4); + rv = embtowc(unicodep, s); if (rv < 0) - eprintf("mbtowc:"); + eprintf("mbtowc: invalid input sequence:"); return rv; } -static void -parsemapping(const char *set1, const char *set2, wchar_t *mappings) +static int +has_octal_escapes(const char *s) { - char *s1, *s2; - wchar_t runeleft; - wchar_t runeright; - int leftbytes; - int rightbytes; + while(*s) + if(*s++ == '\\' && *s >= '0' && *s <= '7') + return 1; + return 0; +} - s1 = (char *)set1; - if(set2) - s2 = (char *)set2; - else - s2 = (char *)set1; +static char +get_next_char(struct set_state *s) +{ + char c; + int nchars; - while(*s1) { - if(*s1 == '\\') - handleescapes(++s1); - leftbytes = xmbtowc(&runeleft, s1); - s1 += leftbytes; - if(*s2 == '\\') - handleescapes(++s2); - if(*s2 != '\0') { - rightbytes = xmbtowc(&runeright, s2); - s2 += rightbytes; +start: + if(s->rfirst <= s->rlast) { + c = s->rfirst; + s->rfirst++; + return c; + } + + if(*s->s == '-' && !s->prev_was_octal) { + s->s++; + if(!*s->s) + return '-'; + if(*s->s == '\\' && (nchars = resolve_escape(++(s->s)))) + goto char_is_octal; + s->rlast = *(s->s)++; + if(!s->rlast) + return '\0'; + s->prev_was_octal = 1; + s->rfirst = ++(s->prev); + goto start; + } + if(*s->s == '\\' && (nchars = resolve_escape(++(s->s)))) + goto char_is_octal; + + s->prev_was_octal = 0; + c = *(s->s)++; + s->prev = c; + return c; + +char_is_octal: + s->prev_was_octal = 1; + c = *s->s; + s->s += nchars; + return c; +} + +static wchar_t +get_next_wchar(struct wset_state *s) +{ +start: + if(s->rfirst <= s->rlast) { + s->prev = s->rfirst; + s->rfirst++; + return s->prev; + } + + if(*s->s == '-' && !s->prev_was_range) { + s->s++; + if(!*s->s) + return '-'; + if(*s->s == '\\') + resolve_escape(++(s->s)); + s->s += xmbtowc(&s->rlast, s->s); + if(!s->rlast) + return '\0'; + s->rfirst = ++(s->prev); + s->prev_was_range = 1; + goto start; + } + + if(*s->s == '\\') + resolve_escape(++(s->s)); + s->s += xmbtowc(&s->prev, s->s); + s->prev_was_range = 0; + return s->prev; +} + +static int +is_mapping_wide(const char *set1, const char *set2) +{ + struct set_state ss1, ss2; + struct wset_state wss1, wss2; + wchar_t wc1, wc2, last_wc2; + + if(has_octal_escapes(set1)) { + set_state_defaults(&ss1); + ss1.s = (char *) set1; + if(set2) { + set_state_defaults(&ss2); + ss2.s = (char *) set2; + /* if the character returned is from an octal triplet, it might be null + and still need to continue */ + while((wc1 = (unsigned char) get_next_char(&ss1)) || ss1.prev_was_octal ) { + if(!(wc2 = (unsigned char) get_next_char(&ss2))) + wc2 = last_wc2; + mappings[wc1] = wc2; + last_wc2 = wc2; + } + } else { + while((wc1 = (unsigned char) get_next_char(&ss1)) || ss1.prev_was_octal) + mappings[wc1] = 1; } - mappings[runeleft] = runeright; + return 0; + } else { + wset_state_defaults(&wss1); + wss1.s = (char *) set1; + if(set2) { + wset_state_defaults(&wss2); + wss2.s = (char *) set2; + while((wc1 = get_next_wchar(&wss1))) { + if(!(wc2 = get_next_wchar(&wss2))) + wc2 = last_wc2; + mappings[wc1] = wc2; + last_wc2 = wc2; + } + } else { + while((wc1 = get_next_wchar(&wss1))) + mappings[wc1] = 1; + } + return 1; + } + return 0; /* unreachable */ +} + +static void +wmap_null(char *in, ssize_t nbytes) +{ + char *s; + wchar_t rune; + int parsed_bytes = 0; + + s = in; + while(nbytes) { + parsed_bytes = embtowc(&rune, s); + if(parsed_bytes < 0) { + rune = *s; + parsed_bytes = 1; + } + if(((!mappings[rune])&1) ^ cflag) + putwchar(rune); + s += parsed_bytes; + nbytes -= parsed_bytes; } } static void -maptonull(const wchar_t *mappings, char *in) +wmap_set(char *in, ssize_t nbytes) { - const char *s; - wchar_t runeleft; - int leftbytes = 0; + char *s; + wchar_t rune; + int parsed_bytes = 0; s = in; - while(*s) { - leftbytes = xmbtowc(&runeleft, s); - if(!mappings[runeleft]) - putwchar(runeleft); - s += leftbytes; - } -} - -static void -maptoset(const wchar_t *mappings, char *in) -{ - const char *s; - wchar_t runeleft; - int leftbytes = 0; - - s = in; - while(*s) { - leftbytes = xmbtowc(&runeleft, s); - if(!mappings[runeleft]) - putwchar(runeleft); + while(nbytes) { + parsed_bytes = embtowc(&rune, s); + if(parsed_bytes < 0) { + rune = *s; + parsed_bytes = 1; + } + if(!mappings[rune]) + putwchar(rune); else - putwchar(mappings[runeleft]); - s += leftbytes; + putwchar(mappings[rune]); + nbytes -= parsed_bytes; + s += parsed_bytes; } } +static void +map_null(char *in, ssize_t nbytes) +{ + char *s; + + for(s=in; nbytes; s++, nbytes--) + if(((!mappings[(unsigned char)*s])&1) ^ cflag) + putchar(*s); +} + +static void +map_set(char *in, ssize_t nbytes) +{ + char *s; + + for(s=in; nbytes; s++, nbytes--) + if(!mappings[(unsigned char)*s]) + putchar(*s); + else + putchar(mappings[(unsigned char)*s]); +} + int main(int argc, char *argv[]) { - wchar_t *mappings; char *buf = NULL; size_t size = 0; - void (*mapfunc)(const wchar_t*, char*); - int dflag = 0; + ssize_t nbytes; + void (*mapfunc)(char*, ssize_t); setlocale(LC_ALL, ""); - - mappings = mmap(NULL, 0x110000 * sizeof(wchar_t), - PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0); - if (mappings == MAP_FAILED) - eprintf("mmap:"); + dflag = cflag = 0; ARGBEGIN { case 'd': dflag = 1; break; + case 'c': + cflag = 1; + break; default: usage(); } ARGEND; @@ -148,25 +328,29 @@ main(int argc, char *argv[]) if(argc == 0) usage(); - if(dflag || argc == 1) { + if(dflag) { if(argc != 1) usage(); - parsemapping(argv[0], NULL, mappings); - mapfunc = maptonull; + if(is_mapping_wide(argv[0], NULL)) + mapfunc = wmap_null; + else + mapfunc = map_null; + } else if(cflag) { + usage(); + } else if(argc == 2) { + if(is_mapping_wide(argv[0], argv[1])) + mapfunc = wmap_set; + else + mapfunc = map_set; } else { - if(argc != 2) - usage(); - parsemapping(argv[0], argv[1], mappings); - mapfunc = maptoset; + usage(); } - while(agetline(&buf, &size, stdin) != -1) - mapfunc(mappings, buf); + while((nbytes = agetline(&buf, &size, stdin)) != -1) + mapfunc(buf, nbytes); free(buf); if(ferror(stdin)) eprintf(": read error:"); - munmap(mappings, 0x110000 * sizeof(wchar_t)); - return EXIT_SUCCESS; }