sbase/sort.c

435 lines
9.1 KiB
C
Raw Normal View History

2011-06-02 12:03:34 +00:00
/* See LICENSE file for copyright and license details. */
#include <ctype.h>
2011-06-02 12:03:34 +00:00
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "queue.h"
2011-06-02 12:03:34 +00:00
#include "text.h"
#include "utf.h"
2011-06-02 12:03:34 +00:00
#include "util.h"
/*
 * One sort key, as parsed from a key definition by addkeydef().
 * Key definitions are kept in the kdhead tail queue and are applied in
 * list order by slinecmp().
 */
struct keydef {
/* column in which the key starts; addkeydef() rejects values < 1 */
int start_column;
/* column in which the key ends; 0 means end of line */
int end_column;
/* character inside start_column where the key starts; defaults to 1 */
int start_char;
/* character inside end_column where the key ends; 0 means end of column */
int end_char;
/* bitmask of MOD_* modifiers that apply to this key */
int flags;
/* linkage for the kdhead tail queue */
TAILQ_ENTRY(keydef) entry;
};
/* Per-key modifier bits stored in keydef.flags. */
enum {
/* compare the keys as numbers (slinecmp() converts them with strtold) */
MOD_N = 1 << 0,
/* skip leading blanks when locating the start of the key */
MOD_STARTB = 1 << 1,
/* skip leading blanks when locating the end of the key */
MOD_ENDB = 1 << 2,
/* negate the comparison result */
MOD_R = 1 << 3,
/* skipmodcmp() skips runes that are neither blank nor alphanumeric */
MOD_D = 1 << 4,
/* skipmodcmp() folds runes to upper case before comparing */
MOD_F = 1 << 5,
/* skipmodcmp() skips non-printable runes */
MOD_I = 1 << 6,
};
/* list of key definitions; addkeydef() appends, slinecmp() walks it in order */
static TAILQ_HEAD(kdhead, keydef) kdhead = TAILQ_HEAD_INITIALIZER(kdhead);
2014-11-23 19:35:56 +00:00
/* set by main() for -C, -c and -u respectively; read by check() and slinecmp() */
static int Cflag = 0, cflag = 0, uflag = 0;
2014-05-15 18:08:17 +00:00
/* field separator given with -t; NULL selects blank-separated columns */
static char *fieldsep = NULL;
/* value returned by unescape(fieldsep); used as the needle length for memmem() */
static size_t fieldseplen = 0;
/* buffers that receive the keys columns() extracts for the two lines being compared */
static struct line col1, col2;
/*
 * Advance a past any leading spaces and tabs, shrinking its length by
 * the number of bytes skipped.
 */
static void
skipblank(struct line *a)
{
	for (; a->len; a->data++, a->len--) {
		if (*a->data != ' ' && *a->data != '\t')
			break;
	}
}
/*
 * Advance a up to the next space, tab or newline (or until no bytes are
 * left), shrinking its length by the number of bytes skipped.
 */
static void
skipnonblank(struct line *a)
{
	char c;

	while (a->len) {
		c = *a->data;
		if (c == '\n' || c == ' ' || c == '\t')
			break;
		a->data++;
		a->len--;
	}
}
2014-11-23 19:35:56 +00:00
/*
 * Advance a past the current column.
 *
 * With a field separator (-t), a is moved onto the next occurrence of
 * fieldsep, or just behind it when skip_to_next_col is non-zero. If no
 * separator is left, a is moved to the last byte of the line.
 * Without a field separator a column is a run of blanks followed by a
 * run of non-blanks, and skip_to_next_col is not consulted.
 */
static void
skipcolumn(struct line *a, int skip_to_next_col)
{
	char *s;

	if (!fieldsep) {
		skipblank(a);
		skipnonblank(a);
		return;
	}

	s = memmem(a->data, a->len, fieldsep, fieldseplen);
	if (s) {
		if (skip_to_next_col)
			s += fieldseplen;
		/*
		 * Shrink the length before moving data: the distance has
		 * to be measured from the old position.
		 */
		a->len -= s - a->data;
		a->data = s;
	} else if (a->len) {
		/* no separator left: point at the last byte of the line */
		a->data += a->len - 1;
		a->len = 1;
	}
}
/*
 * Extract the key described by kd from line into col.
 *
 * col->data is (re)allocated as needed and receives a NUL-terminated
 * copy of the key; col->len is set to the key length in bytes.
 * Returns that length.
 */
static size_t
columns(struct line *line, const struct keydef *kd, struct line *col)
{
	Rune r;
	struct line start, end;
	size_t len, rlen;
	int i;

	/* locate the first byte of the key */
	start.data = line->data;
	start.len = line->len;
	for (i = 1; i < kd->start_column; i++)
		skipcolumn(&start, 1);
	if (kd->flags & MOD_STARTB)
		skipblank(&start);
	/* start_char is 1-based, so start_char - 1 characters are skipped */
	for (i = 1; i < kd->start_char && start.len > 1; i++) {
		rlen = chartorune(&r, start.data);
		start.data += rlen;
		start.len -= rlen;
	}

	/* locate the byte just behind the key */
	end.data = line->data;
	end.len = line->len;
	if (kd->end_column) {
		for (i = 1; i < kd->end_column; i++)
			skipcolumn(&end, 1);
		if (kd->flags & MOD_ENDB)
			skipblank(&end);
		if (kd->end_char) {
			for (i = 0; i < kd->end_char && end.len > 1; i++) {
				rlen = chartorune(&r, end.data);
				end.data += rlen;
				end.len -= rlen;
			}
		} else {
			/* end_char 0: the key runs to the end of the column */
			skipcolumn(&end, 0);
		}
	} else {
		/* end_column 0: the key runs to the last byte of the line */
		end.data += end.len - 1;
		end.len = 1;
	}

	/* an end that lies before the start yields an empty key */
	len = MAX(0, end.data - start.data);

	/*
	 * col->len has to be the length of the key just extracted, not the
	 * largest length seen so far: consumers such as skipmodcmp() bound
	 * their reads with it and would otherwise look at stale bytes
	 * behind the terminator. As no separate capacity is tracked, the
	 * buffer is resized on every call.
	 */
	col->data = erealloc(col->data, len + 1);
	memcpy(col->data, start.data, len);
	col->data[len] = '\0';
	col->len = len;

	return len;
}
/*
 * Tell whether rune r is skipped under the given modifiers:
 * with MOD_D everything that is neither blank nor alphanumeric,
 * with MOD_I everything that is not printable.
 */
static int
skippedrune(Rune r, int flags)
{
	if ((flags & MOD_D) && !isblankrune(r) && !isalnumrune(r))
		return 1;
	if ((flags & MOD_I) && !isprintrune(r))
		return 1;
	return 0;
}

/*
 * Compare a and b rune by rune, honouring the MOD_D, MOD_I and MOD_F
 * modifiers in flags. Skipped runes are stepped over while bytes remain
 * according to the respective len; with MOD_F both runes are converted
 * with toupperrune() before they are compared.
 * Returns the difference of the first pair of runes that differ, or 0
 * if both sides reach a NUL rune together.
 */
static int
skipmodcmp(struct line *a, struct line *b, int flags)
{
	Rune ra, rb;
	size_t ia = 0, ib = 0;

	for (;;) {
		ia += chartorune(&ra, a->data + ia);
		ib += chartorune(&rb, b->data + ib);

		while (ia < a->len && skippedrune(ra, flags))
			ia += chartorune(&ra, a->data + ia);
		while (ib < b->len && skippedrune(rb, flags))
			ib += chartorune(&rb, b->data + ib);

		if (flags & MOD_F) {
			ra = toupperrune(ra);
			rb = toupperrune(rb);
		}

		if (!ra || ra != rb)
			break;
	}

	return ra - rb;
}
static int
slinecmp(struct line *a, struct line *b)
2011-06-02 12:03:34 +00:00
{
int res = 0;
long double x, y;
struct keydef *kd;
TAILQ_FOREACH(kd, &kdhead, entry) {
columns(a, kd, &col1);
columns(b, kd, &col2);
/* if -u is given, don't use default key definition
* unless it is the only one */
if (uflag && kd == TAILQ_LAST(&kdhead, kdhead) &&
TAILQ_LAST(&kdhead, kdhead) != TAILQ_FIRST(&kdhead)) {
res = 0;
} else if (kd->flags & MOD_N) {
x = strtold(col1.data, NULL);
y = strtold(col2.data, NULL);
res = (x < y) ? -1 : (x > y);
} else if (kd->flags & (MOD_D | MOD_F | MOD_I)) {
res = skipmodcmp(&col1, &col2, kd->flags);
} else {
res = linecmp(&col1, &col2);
}
if (kd->flags & MOD_R)
res = -res;
if (res)
break;
}
return res;
}
/*
 * Check that the lines read from fp are in sort order (-c / -C).
 *
 * The previous line is kept in static storage, so the order is also
 * checked across consecutive calls, i.e. across several input files.
 * On the first disorder the offending line is reported on stderr
 * (unless -C was given) and 1 is returned; 0 is returned otherwise.
 * With -u, equal neighbouring lines count as disorder as well.
 */
static int
check(FILE *fp, const char *fname)
{
	static struct line prev, cur, tmp;
	static size_t prevsize, cursize, tmpsize;
	ssize_t len;

	if (!prev.data) {
		if ((len = getline(&prev.data, &prevsize, fp)) < 0) {
			/* only a real read error is fatal */
			if (ferror(fp))
				eprintf("getline %s:", fname);
			/*
			 * Plain end of file: empty input is in order.
			 * getline() may have allocated a buffer anyway, so
			 * release it to let the next input start afresh.
			 */
			free(prev.data);
			prev.data = NULL;
			prevsize = 0;
			return 0;
		}
		prev.len = len;
	}

	while ((len = getline(&cur.data, &cursize, fp)) > 0) {
		cur.len = len;

		/* without -u: cur < prev; with -u: cur <= prev */
		if (uflag > slinecmp(&cur, &prev)) {
			if (!Cflag) {
				weprintf("disorder %s: ", fname);
				fwrite(cur.data, 1, cur.len, stderr);
			}
			return 1;
		}

		/* swap the buffers so that both allocations get reused */
		tmp = cur;
		tmpsize = cursize;
		cur = prev;
		cursize = prevsize;
		prev = tmp;
		prevsize = tmpsize;
	}

	return 0;
}
2013-06-14 18:20:47 +00:00
/*
 * Parse the run of modifier letters at *s and OR the matching MOD_* bits
 * into *flags. The letter 'b' sets bflag, which lets the caller choose
 * between MOD_STARTB and MOD_ENDB.
 * *s is advanced past every letter that was consumed.
 * Returns 0 on success and -1 on an unknown modifier letter.
 */
static int
parse_flags(char **s, int *flags, int bflag)
{
	/* <ctype.h> functions need a value representable as unsigned char */
	while (isalpha((unsigned char)**s)) {
		switch (*((*s)++)) {
		case 'b':
			*flags |= bflag;
			break;
		case 'd':
			*flags |= MOD_D;
			break;
		case 'f':
			*flags |= MOD_F;
			break;
		case 'i':
			*flags |= MOD_I;
			break;
		case 'n':
			*flags |= MOD_N;
			break;
		case 'r':
			*flags |= MOD_R;
			break;
		default:
			return -1;
		}
	}

	return 0;
}
/*
 * Parse the decimal number at s for a key definition and store the
 * position behind it in *end. Returns -1 for negative values and for
 * values that do not survive the conversion to int, so that the range
 * checks in addkeydef() reject them instead of accepting a silently
 * truncated number.
 */
static int
keynum(const char *s, char **end)
{
	long v;

	v = strtol(s, end, 10);
	if (v < 0 || (long)(int)v != v)
		return -1;

	return (int)v;
}

/*
 * Parse the key definition kdstr and append it to the kdhead list.
 * flags provides the initial modifier bits of the key; modifier letters
 * inside kdstr are added to them. Any syntax or range error terminates
 * the program through enprintf() with status 2.
 */
static void
addkeydef(char *kdstr, int flags)
{
	struct keydef *kd;

	kd = enmalloc(2, sizeof(*kd));

	/* parse key definition kdstr with format
	 * start_column[.start_char][flags][,end_column[.end_char][flags]]
	 */
	kd->start_column = 1;
	kd->start_char = 1;
	kd->end_column = 0; /* 0 means end of line */
	kd->end_char = 0;   /* 0 means end of column */
	kd->flags = flags;

	if ((kd->start_column = keynum(kdstr, &kdstr)) < 1)
		enprintf(2, "invalid start column in key definition\n");
	if (*kdstr == '.') {
		if ((kd->start_char = keynum(kdstr + 1, &kdstr)) < 1)
			enprintf(2, "invalid start character in key "
			         "definition\n");
	}
	if (parse_flags(&kdstr, &kd->flags, MOD_STARTB) < 0)
		enprintf(2, "invalid start flags in key definition\n");

	if (*kdstr == ',') {
		if ((kd->end_column = keynum(kdstr + 1, &kdstr)) < 0)
			enprintf(2, "invalid end column in key definition\n");
		if (*kdstr == '.') {
			if ((kd->end_char = keynum(kdstr + 1, &kdstr)) < 0)
				enprintf(2, "invalid end character in key "
				         "definition\n");
		}
		if (parse_flags(&kdstr, &kd->flags, MOD_ENDB) < 0)
			enprintf(2, "invalid end flags in key definition\n");
	}

	/* anything left over is not part of a valid definition */
	if (*kdstr != '\0')
		enprintf(2, "invalid key definition\n");

	TAILQ_INSERT_TAIL(&kdhead, kd, entry);
}
/* Print the usage synopsis through enprintf() with status 2. */
static void
usage(void)
{
	enprintf(2,
	         "usage: %s [-Cbcdfimnru] [-o outfile] [-t delim] [-k def]... [file ...]\n",
	         argv0);
}
int
main(int argc, char *argv[])
{
FILE *fp, *ofp = stdout;
struct linebuf linebuf = EMPTY_LINEBUF;
size_t i;
Add *fshut() functions to properly flush file streams This has been a known issue for a long time. Example: printf "word" > /dev/full wouldn't report there's not enough space on the device. This is due to the fact that every libc has internal buffers for stdout which store fragments of written data until they reach a certain size or on some callback to flush them all at once to the kernel. You can force the libc to flush them with fflush(). In case flushing fails, you can check the return value of fflush() and report an error. However, previously, sbase didn't have such checks and without fflush(), the libc silently flushes the buffers on exit without checking the errors. No offense, but there's no way for the libc to report errors in the exit- condition. GNU coreutils solve this by having onexit-callbacks to handle the flushing and report issues, but they have obvious deficiencies. After long discussions on IRC, we came to the conclusion that checking the return value of every io-function would be a bit too much, and having a general-purpose fclose-wrapper would be the best way to go. It turned out that fclose() alone is not enough to detect errors. The right way to do it is to fflush() + check ferror on the fp and then to a fclose(). This is what fshut does and that's how it's done before each return. The return value is obviously affected, reporting an error in case a flush or close failed, but also when reading failed for some reason, the error- state is caught. the !!( ... + ...) construction is used to call all functions inside the brackets and not "terminating" on the first. We want errors to be reported, but there's no reason to stop flushing buffers when one other file buffer has issues. Obviously, functionales come before the flush and ret-logic comes after to prevent early exits as well without reporting warnings if there are any. 
One more advantage of fshut() is that it is even able to report errors on obscure NFS-setups which the other coreutils are unable to detect, because they only check the return-value of fflush() and fclose(), not ferror() as well.
2015-04-04 19:25:17 +00:00
int global_flags = 0, ret = 0;
char *outfile = NULL;
ARGBEGIN {
case 'C':
Cflag = 1;
break;
case 'b':
global_flags |= MOD_STARTB | MOD_ENDB;
break;
case 'c':
cflag = 1;
break;
case 'd':
global_flags |= MOD_D;
break;
case 'f':
global_flags |= MOD_F;
break;
case 'i':
global_flags |= MOD_I;
break;
case 'k':
addkeydef(EARGF(usage()), global_flags);
break;
case 'm':
/* more or less for free, but for performance-reasons,
* we should keep this flag in mind and maybe some later
* day implement it properly so we don't run out of memory
* while merging large sorted files.
*/
break;
case 'n':
global_flags |= MOD_N;
break;
case 'o':
outfile = EARGF(usage());
break;
case 'r':
global_flags |= MOD_R;
break;
case 't':
fieldsep = EARGF(usage());
if (!*fieldsep)
eprintf("empty delimiter\n");
fieldseplen = unescape(fieldsep);
break;
case 'u':
uflag = 1;
break;
default:
usage();
} ARGEND
2015-04-05 18:31:28 +00:00
/* -b shall only apply to custom key definitions */
if (TAILQ_EMPTY(&kdhead) && global_flags)
addkeydef("1", global_flags & ~(MOD_STARTB | MOD_ENDB));
addkeydef("1", global_flags & MOD_R);
if (!argc) {
if (Cflag || cflag) {
if (check(stdin, "<stdin>") && !ret)
ret = 1;
} else {
getlines(stdin, &linebuf);
}
} else for (; *argv; argc--, argv++) {
if (!strcmp(*argv, "-")) {
*argv = "<stdin>";
fp = stdin;
} else if (!(fp = fopen(*argv, "r"))) {
enprintf(2, "fopen %s:", *argv);
continue;
}
if (Cflag || cflag) {
if (check(fp, *argv) && !ret)
ret = 1;
} else {
getlines(fp, &linebuf);
}
if (fp != stdin && fshut(fp, *argv))
ret = 2;
}
if (!Cflag && !cflag) {
if (outfile && !(ofp = fopen(outfile, "w")))
eprintf("fopen %s:", outfile);
qsort(linebuf.lines, linebuf.nlines, sizeof(*linebuf.lines),
(int (*)(const void *, const void *))slinecmp);
for (i = 0; i < linebuf.nlines; i++) {
if (!uflag || i == 0 ||
slinecmp(&linebuf.lines[i], &linebuf.lines[i - 1])) {
fwrite(linebuf.lines[i].data, 1,
linebuf.lines[i].len, ofp);
}
}
}
if (fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>") |
fshut(stderr, "<stderr>"))
ret = 2;
Add *fshut() functions to properly flush file streams This has been a known issue for a long time. Example: printf "word" > /dev/full wouldn't report there's not enough space on the device. This is due to the fact that every libc has internal buffers for stdout which store fragments of written data until they reach a certain size or on some callback to flush them all at once to the kernel. You can force the libc to flush them with fflush(). In case flushing fails, you can check the return value of fflush() and report an error. However, previously, sbase didn't have such checks and without fflush(), the libc silently flushes the buffers on exit without checking the errors. No offense, but there's no way for the libc to report errors in the exit- condition. GNU coreutils solve this by having onexit-callbacks to handle the flushing and report issues, but they have obvious deficiencies. After long discussions on IRC, we came to the conclusion that checking the return value of every io-function would be a bit too much, and having a general-purpose fclose-wrapper would be the best way to go. It turned out that fclose() alone is not enough to detect errors. The right way to do it is to fflush() + check ferror on the fp and then to a fclose(). This is what fshut does and that's how it's done before each return. The return value is obviously affected, reporting an error in case a flush or close failed, but also when reading failed for some reason, the error- state is caught. the !!( ... + ...) construction is used to call all functions inside the brackets and not "terminating" on the first. We want errors to be reported, but there's no reason to stop flushing buffers when one other file buffer has issues. Obviously, functionales come before the flush and ret-logic comes after to prevent early exits as well without reporting warnings if there are any. 
One more advantage of fshut() is that it is even able to report errors on obscure NFS-setups which the other coreutils are unable to detect, because they only check the return-value of fflush() and fclose(), not ferror() as well.
2015-04-04 19:25:17 +00:00
return ret;
}