Add UTF-8 delimiter support to cut(1)
Now you can specify a multibyte delimiter to cut, which should definitely be possible for the end user (Fuck POSIX). Looking at GNU/coreutils' cut(1)[0], which basically ignores the difference between characters and bytes, ignores the -n option, and is bloated as hell, one has to wonder why they are still the default. This is insane! Things like this personally keep me motivated to make sbase better every day. [0]: http://git.savannah.gnu.org/gitweb/?p=coreutils.git;a=blob;f=src/cut.c;hb=HEAD (NSFW! You have been warned.)
This commit is contained in:
parent
dc70eb7976
commit
733b33f1c7
2
README
2
README
|
@ -22,7 +22,7 @@ The following tools are implemented ('*' == finished, '#' == UTF-8 support,
|
||||||
=* comm yes none
|
=* comm yes none
|
||||||
= cp no -H, -i, -L
|
= cp no -H, -i, -L
|
||||||
=* cron non-posix none
|
=* cron non-posix none
|
||||||
* cut yes none
|
#* cut yes none
|
||||||
= date yes none
|
= date yes none
|
||||||
= dirname yes none
|
= dirname yes none
|
||||||
= du no -H, -L, -x
|
= du no -H, -L, -x
|
||||||
|
|
4
cut.1
4
cut.1
|
@ -1,4 +1,4 @@
|
||||||
.Dd January 18, 2015
|
.Dd January 22, 2015
|
||||||
.Dt CUT 1 sbase\-VERSION
|
.Dt CUT 1 sbase\-VERSION
|
||||||
.Sh NAME
|
.Sh NAME
|
||||||
.Nm cut
|
.Nm cut
|
||||||
|
@ -67,4 +67,4 @@ utility is compliant with the
|
||||||
specification.
|
specification.
|
||||||
.Pp
|
.Pp
|
||||||
The possibility of separating numbers and ranges with a space
|
The possibility of separating numbers and ranges with a space
|
||||||
is an extension to that specification.
|
and specifying multibyte delimiters is an extension to that specification.
|
||||||
|
|
44
cut.c
44
cut.c
|
@ -4,6 +4,7 @@
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#include "text.h"
|
#include "text.h"
|
||||||
|
#include "utf.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
|
|
||||||
typedef struct Range {
|
typedef struct Range {
|
||||||
|
@ -13,7 +14,8 @@ typedef struct Range {
|
||||||
|
|
||||||
static Range *list = NULL;
|
static Range *list = NULL;
|
||||||
static char mode = 0;
|
static char mode = 0;
|
||||||
static char delim = '\t';
|
static Rune delim = '\t';
|
||||||
|
static size_t delimlen = 1;
|
||||||
static int nflag = 0;
|
static int nflag = 0;
|
||||||
static int sflag = 0;
|
static int sflag = 0;
|
||||||
|
|
||||||
|
@ -70,10 +72,11 @@ static size_t
|
||||||
seek(const char *s, size_t pos, size_t *prev, size_t count)
|
seek(const char *s, size_t pos, size_t *prev, size_t count)
|
||||||
{
|
{
|
||||||
const char *t;
|
const char *t;
|
||||||
size_t n = pos - *prev;
|
size_t n = pos - *prev, i;
|
||||||
|
Rune r;
|
||||||
|
|
||||||
if (mode == 'b') {
|
if (mode == 'b') {
|
||||||
if ((t = memchr(s, 0, n)))
|
if ((t = memchr(s, '\0', n)))
|
||||||
return t - s;
|
return t - s;
|
||||||
if (nflag)
|
if (nflag)
|
||||||
while (n && !UTF8_POINT(s[n]))
|
while (n && !UTF8_POINT(s[n]))
|
||||||
|
@ -85,11 +88,18 @@ seek(const char *s, size_t pos, size_t *prev, size_t count)
|
||||||
if (UTF8_POINT(*t) && !--n)
|
if (UTF8_POINT(*t) && !--n)
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
for (t = (count < 2) ? s : s + 1; n && *t; t++)
|
for (t = (count < delimlen + 1) ? s : s + delimlen; n && *t; ) {
|
||||||
if (*t == delim && !--n && count)
|
for (i = 1; t[i]; i++)
|
||||||
|
if (fullrune(t, i))
|
||||||
break;
|
break;
|
||||||
|
charntorune(&r, t, i);
|
||||||
|
if (r == delim && !--n && count)
|
||||||
|
break;
|
||||||
|
t += i;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
*prev = pos;
|
*prev = pos;
|
||||||
|
|
||||||
return t - s;
|
return t - s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -106,20 +116,22 @@ cut(FILE *fp)
|
||||||
while ((len = getline(&buf, &size, fp)) != -1) {
|
while ((len = getline(&buf, &size, fp)) != -1) {
|
||||||
if (len && buf[len - 1] == '\n')
|
if (len && buf[len - 1] == '\n')
|
||||||
buf[len - 1] = '\0';
|
buf[len - 1] = '\0';
|
||||||
if (mode == 'f' && !strchr(buf, delim)) {
|
if (mode == 'f' && !utfrune(buf, delim)) {
|
||||||
if (!sflag)
|
if (!sflag)
|
||||||
puts(buf);
|
puts(buf);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
for (i = 0, p = 1, s = buf, r = list; r; r = r->next, s += n) {
|
for (i = 0, p = 1, s = buf, r = list; r; r = r->next, s += n) {
|
||||||
s += seek(s, r->min, &p, i++);
|
s += seek(s, r->min, &p, i);
|
||||||
|
i += (mode == 'f') ? delimlen : 1;
|
||||||
if (!*s)
|
if (!*s)
|
||||||
break;
|
break;
|
||||||
if (!r->max) {
|
if (!r->max) {
|
||||||
fputs(s, stdout);
|
fputs(s, stdout);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
n = seek(s, r->max + 1, &p, i++);
|
n = seek(s, r->max + 1, &p, i);
|
||||||
|
i += (mode == 'f') ? delimlen : 1;
|
||||||
if (fwrite(s, 1, n, stdout) != n)
|
if (fwrite(s, 1, n, stdout) != n)
|
||||||
eprintf("write error:");
|
eprintf("write error:");
|
||||||
}
|
}
|
||||||
|
@ -139,16 +151,27 @@ int
|
||||||
main(int argc, char *argv[])
|
main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
|
int i;
|
||||||
|
char *m, *d;
|
||||||
|
|
||||||
ARGBEGIN {
|
ARGBEGIN {
|
||||||
case 'b':
|
case 'b':
|
||||||
case 'c':
|
case 'c':
|
||||||
case 'f':
|
case 'f':
|
||||||
mode = ARGC();
|
mode = ARGC();
|
||||||
parselist(ARGF());
|
m = ARGF();
|
||||||
|
if (!m)
|
||||||
|
usage();
|
||||||
|
parselist(m);
|
||||||
break;
|
break;
|
||||||
case 'd':
|
case 'd':
|
||||||
delim = *ARGF();
|
if(!(d = ARGF()))
|
||||||
|
usage();
|
||||||
|
for (i = 1; i <= strlen(d); i++)
|
||||||
|
if (fullrune(d, i))
|
||||||
|
break;
|
||||||
|
charntorune(&delim, d, i);
|
||||||
|
delimlen = i;
|
||||||
break;
|
break;
|
||||||
case 'n':
|
case 'n':
|
||||||
nflag = 1;
|
nflag = 1;
|
||||||
|
@ -162,7 +185,6 @@ main(int argc, char *argv[])
|
||||||
|
|
||||||
if (!mode)
|
if (!mode)
|
||||||
usage();
|
usage();
|
||||||
|
|
||||||
if (!argc)
|
if (!argc)
|
||||||
cut(stdin);
|
cut(stdin);
|
||||||
else for (; argc--; argv++) {
|
else for (; argc--; argv++) {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user