Improved tr

- Added support for character ranges ( a-z )
- Added support for complementary charset ( -c ), only in delete mode
- Added support for octal escape sequences
- Unicode now only works when there are no octal escape sequences;
  otherwise tr operates on single bytes and its behavior on multibyte
  characters is not predictable at first sight.
- tr now supports null characters in the input
- Does not yet have support for character classes ( [:upper:] )
This commit is contained in:
Adria Garriga
2014-07-15 00:49:42 +02:00
committed by sin
parent 8b3a9c1971
commit b3a63a60e4
2 changed files with 278 additions and 85 deletions

350
tr.c
View File

@@ -3,7 +3,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <locale.h>
#include <wchar.h>
#include "text.h"
@@ -12,135 +11,316 @@
/*
 * Print the command synopsis through eprintf().
 * Only the current synopsis (the one that advertises -c) is emitted; the
 * older "[-d] set1 [set2]" line that preceded it was stale.
 * NOTE(review): eprintf() is presumably fatal, callers rely on usage() not
 * returning -- confirm against util.h.
 */
static void
usage(void)
{
	eprintf("usage: %s [-d] [-c] set1 [set2]\n", argv0);
}
/* Option flags; main() sets them to 1 when -d / -c are given. */
static int dflag, cflag;
/*
 * Translation table indexed by character value. A zero entry means the
 * character is not in set1; in delete mode members are marked with 1,
 * otherwise the entry holds the replacement character.
 * NOTE(review): 0x110000 is presumably one slot per Unicode code point.
 */
static wchar_t mappings[0x110000];
/* Cursor used by get_next_wchar() to walk a set operand as wide characters. */
struct wset_state {
	char *s;            /* position in the set string still to be decoded */
	wchar_t rfirst;     /* next character to hand out from the pending range */
	wchar_t rlast;      /* final character of the pending range */
	wchar_t prev;       /* character returned by the previous call */
	int prev_was_range; /* nonzero when prev came out of a c-c range */
};
/* Cursor used by get_next_char() to walk a set operand byte by byte. */
struct set_state {
	char *s;            /* position in the set string still to be read */
	char rfirst;        /* next byte to hand out from the pending range */
	char rlast;         /* final byte of the pending range */
	char prev;          /* last literal byte returned */
	int prev_was_octal; /* nonzero when the previous character came from an
	                     * octal escape; also set once a range is started */
};
/*
 * Put the range bookkeeping of *s into its initial state.
 * rfirst > rlast means "no range pending"; prev_was_octal starts at 1, which
 * makes get_next_char() treat a leading '-' as a literal character.
 * The s and prev members are not touched.
 */
static void
set_state_defaults(struct set_state *s)
{
	s->prev_was_octal = 1;
	s->rlast = 0;
	s->rfirst = 1;
}
static void
handleescapes(char *s)
wset_state_defaults(struct wset_state *s)
{
s->rfirst = 1;
s->rlast = 0;
s->prev_was_range = 1;
}
/*
 * Resolve the escape sequence whose first character after the backslash is
 * at *s, overwriting *s with the byte the sequence stands for.
 *
 * Returns 0 for the single-character escapes (\n, \t, \r, \f, \a, \b, \v,
 * \\). For an octal escape it returns the number of digits consumed (1-3),
 * which is how far the caller has to advance s to step over the sequence.
 *
 * A backslash at the end of the string, an unknown escape character and an
 * octal value above 377 are reported through eprintf().
 * NOTE(review): the code relies on eprintf() not returning -- confirm.
 */
static int
resolve_escape(char *s)
{
	int i;
	unsigned char c;

	switch(*s) {
	case 'n':
		*s = '\n';
		return 0;
	case 't':
		*s = '\t';
		return 0;
	case 'r':
		*s = '\r';
		return 0;
	case 'f':
		*s = '\f';
		return 0;
	case 'a':
		*s = '\a';
		return 0;
	case 'b':
		*s = '\b';
		return 0;
	case 'v':
		*s = '\v';
		return 0;
	case '\\':
		*s = '\\';
		return 0;
	case '\0':
		eprintf("stray '\\' at end of input:");
		/* fallthrough */
	default:
		break;
	}

	/* anything that is not a named escape must start an octal number */
	if(*s < '0' || *s > '7')
		eprintf("invalid character after '\\':");

	/* accumulate up to three octal digits; test the count before reading */
	for(i = 0, c = 0; i < 3 && s[i] >= '0' && s[i] <= '7'; i++) {
		c <<= 3;
		c += s[i] - '0';
	}
	/* three digits with a leading digit above 3 do not fit in one byte */
	if(*s > '3' && i == 3)
		eprintf("octal byte cannot be bigger than 377:");
	*s = c;
	return i;
}
/* Decode one multibyte character from b into *a, examining at most 4 bytes. */
#define embtowc(a, b) mbtowc(a, b, 4)
/*
 * Decode the multibyte character at s into *unicodep using embtowc().
 * Returns the number of bytes the character occupies (0 when s points at a
 * NUL byte). An invalid sequence is reported through eprintf(); the error
 * call is made only when decoding actually failed.
 */
static int
xmbtowc(wchar_t *unicodep, const char *s)
{
	int rv;

	rv = embtowc(unicodep, s);
	if(rv < 0)
		eprintf("mbtowc: invalid input sequence:");
	return rv;
}
/*
 * Report whether the set string s contains an octal escape sequence, i.e. a
 * backslash that starts an escape and is followed by a digit 0-7.
 * Returns 1 if one is found, 0 otherwise.
 *
 * The character after a non-octal backslash is skipped, so an escaped
 * backslash followed by a digit ("\\1") is not mistaken for an octal escape.
 */
static int
has_octal_escapes(const char *s)
{
	while(*s) {
		if(*s++ != '\\')
			continue;
		if(*s >= '0' && *s <= '7')
			return 1;
		/* step over the escaped character, but never past the NUL */
		if(*s)
			s++;
	}
	return 0;
}
/*
 * Return the next byte of the set described by *s, expanding "c-c" ranges
 * and backslash escapes along the way.
 *
 * - While a range is pending (rfirst <= rlast) its members are returned one
 *   per call.
 * - A '-' that follows a literal character starts a range running from the
 *   character after prev up to the character after the '-'. A '-' at the
 *   very end of the string is returned literally.
 * - An octal escape is returned as its byte value and sets prev_was_octal,
 *   so callers can tell a genuine NUL byte from the end of the set.
 *
 * At the end of the string '\0' is returned with prev_was_octal cleared, and
 * the cursor stays on the terminator so further calls keep returning '\0'
 * instead of reading beyond the string.
 */
static char
get_next_char(struct set_state *s)
{
	char c;
	int nchars;

start:
	if(s->rfirst <= s->rlast) {
		c = s->rfirst;
		if(c == s->rlast) {
			/* range exhausted: close it without incrementing, so
			 * a range ending at the largest char cannot wrap */
			s->rfirst = 1;
			s->rlast = 0;
		} else {
			s->rfirst++;
		}
		return c;
	}

	if(*s->s == '-' && !s->prev_was_octal) {
		s->s++;
		if(!*s->s)
			return '-';
		if(*s->s == '\\' && (nchars = resolve_escape(++(s->s))))
			goto char_is_octal;
		s->rlast = *(s->s)++;
		if(!s->rlast)
			return '\0';
		/* blocks a second '-' from chaining onto this range */
		s->prev_was_octal = 1;
		s->rfirst = ++(s->prev);
		goto start;
	}

	if(*s->s == '\\' && (nchars = resolve_escape(++(s->s))))
		goto char_is_octal;

	s->prev_was_octal = 0;
	c = *s->s;
	if(c != '\0')
		s->s++; /* never step over the terminating NUL */
	s->prev = c;
	return c;

char_is_octal:
	s->prev_was_octal = 1;
	c = *s->s;
	s->s += nchars;
	return c;
}
/*
 * Return the next wide character of the set described by *s, expanding
 * "c-c" ranges and single-character backslash escapes.
 *
 * Members of a pending range (rfirst <= rlast) are handed out one per call.
 * A '-' that follows a non-range character opens a new range from the
 * character after prev up to the character following the '-'; a '-' at the
 * end of the string is returned literally. Returns 0 once the string is
 * exhausted.
 */
static wchar_t
get_next_wchar(struct wset_state *s)
{
	for(;;) {
		/* drain the pending range first */
		if(s->rfirst <= s->rlast) {
			s->prev = s->rfirst++;
			return s->prev;
		}

		/* only a '-' after a plain character opens a range */
		if(*s->s != '-' || s->prev_was_range)
			break;

		s->s++;
		if(*s->s == '\0')
			return '-';
		if(*s->s == '\\')
			resolve_escape(++(s->s));
		s->s += xmbtowc(&s->rlast, s->s);
		if(s->rlast == 0)
			return '\0';
		s->rfirst = ++(s->prev);
		s->prev_was_range = 1;
	}

	/* ordinary (possibly escaped) character */
	if(*s->s == '\\')
		resolve_escape(++(s->s));
	s->s += xmbtowc(&s->prev, s->s);
	s->prev_was_range = 0;
	return s->prev;
}
/*
 * Fill the global mappings table from set1 (and set2, when given) and tell
 * the caller which kind of mapper must process the input.
 *
 * With set2 == NULL every member of set1 is marked with 1. Otherwise the
 * n-th member of set1 maps to the n-th member of set2; once set2 runs out,
 * its last member is reused (0, i.e. "no mapping", if set2 is empty).
 *
 * Returns 0 when either set contains an octal escape, in which case the sets
 * were read byte by byte, and 1 when they were read as wide characters.
 * The set strings are modified in place while escapes are resolved.
 */
static int
is_mapping_wide(const char *set1, const char *set2)
{
	struct set_state ss1, ss2;
	struct wset_state wss1, wss2;
	wchar_t wc1, wc2, last_wc2 = 0;
	int set2_done = 0;

	if(has_octal_escapes(set1) || (set2 && has_octal_escapes(set2))) {
		set_state_defaults(&ss1);
		ss1.s = (char *) set1;
		if(set2) {
			set_state_defaults(&ss2);
			ss2.s = (char *) set2;
			/* if the character returned is from an octal triplet, it might be null
			   and still need to continue */
			while((wc1 = (unsigned char) get_next_char(&ss1)) || ss1.prev_was_octal) {
				wc2 = 0;
				if(!set2_done) {
					wc2 = (unsigned char) get_next_char(&ss2);
					/* a non-octal NUL is the real end of set2:
					 * stop asking it for more characters */
					if(!wc2 && !ss2.prev_was_octal)
						set2_done = 1;
				}
				if(!wc2)
					wc2 = last_wc2;
				mappings[wc1] = wc2;
				last_wc2 = wc2;
			}
		} else {
			while((wc1 = (unsigned char) get_next_char(&ss1)) || ss1.prev_was_octal)
				mappings[wc1] = 1;
		}
		return 0;
	}

	wset_state_defaults(&wss1);
	wss1.s = (char *) set1;
	if(set2) {
		wset_state_defaults(&wss2);
		wss2.s = (char *) set2;
		while((wc1 = get_next_wchar(&wss1))) {
			if(!(wc2 = get_next_wchar(&wss2)))
				wc2 = last_wc2;
			mappings[wc1] = wc2;
			last_wc2 = wc2;
		}
	} else {
		while((wc1 = get_next_wchar(&wss1)))
			mappings[wc1] = 1;
	}
	return 1;
}
static void
wmap_null(char *in, ssize_t nbytes)
{
char *s;
wchar_t rune;
int parsed_bytes = 0;
s = in;
while(nbytes) {
parsed_bytes = embtowc(&rune, s);
if(parsed_bytes < 0) {
rune = *s;
parsed_bytes = 1;
}
if(((!mappings[rune])&1) ^ cflag)
putwchar(rune);
s += parsed_bytes;
nbytes -= parsed_bytes;
}
}
static void
maptonull(const wchar_t *mappings, char *in)
wmap_set(char *in, ssize_t nbytes)
{
const char *s;
wchar_t runeleft;
int leftbytes = 0;
char *s;
wchar_t rune;
int parsed_bytes = 0;
s = in;
while(*s) {
leftbytes = xmbtowc(&runeleft, s);
if(!mappings[runeleft])
putwchar(runeleft);
s += leftbytes;
}
}
static void
maptoset(const wchar_t *mappings, char *in)
{
const char *s;
wchar_t runeleft;
int leftbytes = 0;
s = in;
while(*s) {
leftbytes = xmbtowc(&runeleft, s);
if(!mappings[runeleft])
putwchar(runeleft);
while(nbytes) {
parsed_bytes = embtowc(&rune, s);
if(parsed_bytes < 0) {
rune = *s;
parsed_bytes = 1;
}
if(!mappings[rune])
putwchar(rune);
else
putwchar(mappings[runeleft]);
s += leftbytes;
putwchar(mappings[rune]);
nbytes -= parsed_bytes;
s += parsed_bytes;
}
}
/*
 * Delete-mode filter for byte input: write, with putchar(), each of the
 * nbytes bytes at in that is not marked in mappings, or, when cflag is set,
 * each byte that is marked.
 */
static void
map_null(char *in, ssize_t nbytes)
{
	unsigned char byte;

	while(nbytes != 0) {
		byte = (unsigned char)*in;
		/* keep the byte when "absent from the set" differs from cflag */
		if(!mappings[byte] != cflag)
			putchar(byte);
		in++;
		nbytes--;
	}
}
/*
 * Translate-mode filter for byte input: for each of the nbytes bytes at in,
 * write its replacement from mappings with putchar(), or the byte itself
 * when the table holds no entry for it.
 */
static void
map_set(char *in, ssize_t nbytes)
{
	unsigned char byte;
	wchar_t target;

	while(nbytes != 0) {
		byte = (unsigned char)*in;
		target = mappings[byte];
		if(target)
			putchar(target);
		else
			putchar(byte);
		in++;
		nbytes--;
	}
}
int
main(int argc, char *argv[])
{
wchar_t *mappings;
char *buf = NULL;
size_t size = 0;
void (*mapfunc)(const wchar_t*, char*);
int dflag = 0;
ssize_t nbytes;
void (*mapfunc)(char*, ssize_t);
setlocale(LC_ALL, "");
mappings = mmap(NULL, 0x110000 * sizeof(wchar_t),
PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
if (mappings == MAP_FAILED)
eprintf("mmap:");
dflag = cflag = 0;
ARGBEGIN {
case 'd':
dflag = 1;
break;
case 'c':
cflag = 1;
break;
default:
usage();
} ARGEND;
@@ -148,25 +328,29 @@ main(int argc, char *argv[])
if(argc == 0)
usage();
if(dflag || argc == 1) {
if(dflag) {
if(argc != 1)
usage();
parsemapping(argv[0], NULL, mappings);
mapfunc = maptonull;
if(is_mapping_wide(argv[0], NULL))
mapfunc = wmap_null;
else
mapfunc = map_null;
} else if(cflag) {
usage();
} else if(argc == 2) {
if(is_mapping_wide(argv[0], argv[1]))
mapfunc = wmap_set;
else
mapfunc = map_set;
} else {
if(argc != 2)
usage();
parsemapping(argv[0], argv[1], mappings);
mapfunc = maptoset;
usage();
}
while(agetline(&buf, &size, stdin) != -1)
mapfunc(mappings, buf);
while((nbytes = agetline(&buf, &size, stdin)) != -1)
mapfunc(buf, nbytes);
free(buf);
if(ferror(stdin))
eprintf("<stdin>: read error:");
munmap(mappings, 0x110000 * sizeof(wchar_t));
return EXIT_SUCCESS;
}