Add join(1)
This commit is contained in:
parent
f83d7bc647
commit
cd0b771cbb
1
LICENSE
1
LICENSE
|
@ -58,3 +58,4 @@ Authors/contributors include:
|
|||
© 2015 Tai Chi Minh Ralph Eastwood <tcmreastwood@gmail.com>
|
||||
© 2015 Quentin Rameau <quinq@quinq.eu.org>
|
||||
© 2015 Dionysis Grigoropoulos <info@erethon.com>
|
||||
© 2015 Wolfgang Corcoran-Mathe <first.lord.of.teal@gmail.com>
|
||||
|
|
1
README
1
README
|
@ -40,6 +40,7 @@ The following tools are implemented:
|
|||
=* o grep .
|
||||
=*|o head .
|
||||
=*|x hostname .
|
||||
=* o join .
|
||||
=*|o kill .
|
||||
=*|o link .
|
||||
=*|o ln .
|
||||
|
|
105
join.1
Normal file
105
join.1
Normal file
|
@ -0,0 +1,105 @@
|
|||
.Dd April 18, 2015
|
||||
.Dt JOIN 1
|
||||
.Os sbase
|
||||
.Sh NAME
|
||||
.Nm join
|
||||
.Nd relational database operator
|
||||
.Sh SYNOPSIS
|
||||
.Nm
|
||||
.Op Fl 1 Ar field
|
||||
.Op Fl 2 Ar field
|
||||
.Op Fl o Ar list
|
||||
.Op Fl e Ar string
|
||||
.Op Fl a Ar fileno | Fl v Ar fileno
|
||||
.Op Fl t Ar delim
|
||||
.Ar file1 file2
|
||||
.Sh DESCRIPTION
|
||||
.Nm
|
||||
lines from
|
||||
.Ar file1
|
||||
and
|
||||
.Ar file2
|
||||
on a matching field. If one of the input files is '-', standard input
|
||||
is read for that file.
|
||||
.Pp
|
||||
Files are read sequentially and are assumed to be sorted on the join
|
||||
field.
|
||||
.Nm
|
||||
does not check the order of input, and joining two unsorted files will
|
||||
produce unexpected output.
|
||||
.Pp
|
||||
By default, input lines are matched on the first blank-separated
|
||||
field; output lines are space-separated and consist of the join field
|
||||
followed by the remaining fields from
|
||||
.Ar file1 Ns ,
|
||||
then the remaining fields from
|
||||
.Ar file2 Ns .
|
||||
.Sh OPTIONS
|
||||
.Bl -tag -width Ds
|
||||
.It Fl 1 Ar field
|
||||
Join on the
|
||||
.Ar field Ns th
|
||||
field of file 1.
|
||||
.It Fl 2 Ar field
|
||||
Join on the
|
||||
.Ar field Ns th
|
||||
field of file 2.
|
||||
.It Fl a Ar fileno
|
||||
Print unpairable lines from file
|
||||
.Ar fileno
|
||||
in addition to normal output.
|
||||
.It Fl e Ar string
|
||||
When used with
|
||||
.Fl o Ns ,
|
||||
replace empty fields in the output list with
|
||||
.Ar string Ns .
|
||||
.It Fl o Ar list
|
||||
Format output according to the string
|
||||
.Ar list Ns .
|
||||
Each element of
|
||||
.Ar list
|
||||
may be either
|
||||
.Ar fileno.field
|
||||
or 0 (representing the join field).
|
||||
Elements in
|
||||
.Ar list
|
||||
may be separated by blanks or commas. For example,
|
||||
.Bd -literal -offset indent
|
||||
join -o "0 2.1 1.3"
|
||||
.Ed
|
||||
.Pp
|
||||
would print the join field, the first field of
|
||||
.Ar file2 Ns ,
|
||||
then the third field of
|
||||
.Ar file1 Ns .
|
||||
.Pp
|
||||
Only paired lines are formatted with the
|
||||
.Fl o
|
||||
option. Unpairable lines (selected with
|
||||
.Fl a
|
||||
or
|
||||
.Fl v Ns )
|
||||
are printed raw.
|
||||
.It Fl t Ar delim
|
||||
Use the arbitrary string
|
||||
.Ar delim
|
||||
as field delimiter for both input and output.
|
||||
.It Fl v Ar fileno
|
||||
Print unpairable lines from file
|
||||
.Ar fileno
|
||||
instead of normal output.
|
||||
.El
|
||||
.Sh STANDARDS
|
||||
The
|
||||
.Nm
|
||||
utility is compliant with the
|
||||
.St -p1003.1-2013
|
||||
specification with the following exception:
|
||||
.Bl -bullet -offset indent
|
||||
.It
|
||||
Unpairable lines ignore formatting specified with
|
||||
.Fl o Ns .
|
||||
.El
|
||||
.Pp
|
||||
The possibility of specifying multibyte delimiters of arbitrary
|
||||
length is an extension to the specification.
|
554
join.c
Normal file
554
join.c
Normal file
|
@ -0,0 +1,554 @@
|
|||
#include <ctype.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "arg.h"
|
||||
#include "text.h"
|
||||
#include "utf.h"
|
||||
#include "util.h"
|
||||
|
||||
/* Sizing policy shared by the growable arrays in this file. */
enum {
	INIT = 1,	/* initial element count */
	GROW = 2,	/* capacity multiplier applied when an array is full */
};

/* Third argument of addtospan(). */
enum {
	EXPAND = 0,	/* append the new line to the span */
	RESET = 1,	/* restart the span at index 0 before storing the line */
};

/* Returned by linecmp() when neither line has its join field. */
enum { FIELD_ERROR = -2, };
|
||||
|
||||
/* One field of a line: a pointer into the line's text plus its byte length. */
struct field {
	char *s;
	size_t len;
};

/* One input line and the fields it was split into. */
struct line {
	char *text;		/* the line's text buffer */
	size_t nf, maxf;	/* fields in use / slots allocated */
	struct field *fields;
};

/* One element of an output list: which file and which field index. */
struct spec {
	size_t fileno;
	size_t fldno;
};

/* Growable array of output specs. */
struct outlist {
	size_t ns, maxs;	/* specs in use / slots allocated */
	struct spec **specs;
};

/* Growable array of lines. */
struct span {
	size_t nl, maxl;	/* lines in use / slots allocated */
	struct line **lines;
};
|
||||
|
||||
static char *sep = NULL;
|
||||
static char *replace = NULL;
|
||||
static const char defaultofs = ' ';
|
||||
static const int jfield = 1; /* POSIX default join field */
|
||||
static int unpairsa = 0, unpairsb = 0;
|
||||
static int oflag = 0;
|
||||
static int pairs = 1;
|
||||
static size_t seplen;
|
||||
static struct outlist output;
|
||||
|
||||
char *argv0;
|
||||
|
||||
|
||||
static void
|
||||
usage(void)
|
||||
{
|
||||
eprintf("usage: %s [-1 field] [-2 field] [-o list] [-e string] "
|
||||
"[-a | -v fileno] [-t delim] file1 file2\n", argv0);
|
||||
}
|
||||
|
||||
static void
|
||||
prfield(struct field *fp)
|
||||
{
|
||||
if (fwrite(fp->s, 1, fp->len, stdout) != fp->len)
|
||||
eprintf("fwrite:");
|
||||
}
|
||||
|
||||
static void
|
||||
prsep(void)
|
||||
{
|
||||
if (sep)
|
||||
fwrite(sep, 1, seplen, stdout);
|
||||
else
|
||||
putchar(defaultofs);
|
||||
}
|
||||
|
||||
static void
|
||||
swaplines(struct line *la, struct line *lb)
|
||||
{
|
||||
struct line tmp;
|
||||
|
||||
tmp = *la;
|
||||
*la = *lb;
|
||||
*lb = tmp;
|
||||
}
|
||||
|
||||
static void
|
||||
prjoin(struct line *la, struct line *lb, size_t jfa, size_t jfb)
|
||||
{
|
||||
struct spec *sp;
|
||||
struct field *joinfield;
|
||||
size_t i;
|
||||
|
||||
if (jfa >= la->nf || jfb >= lb->nf)
|
||||
return;
|
||||
|
||||
joinfield = &la->fields[jfa];
|
||||
|
||||
if (oflag) {
|
||||
for (i = 0; i < output.ns; i++) {
|
||||
sp = output.specs[i];
|
||||
|
||||
if (sp->fileno == 1) {
|
||||
if (sp->fldno < la->nf)
|
||||
prfield(&la->fields[sp->fldno]);
|
||||
else if (replace)
|
||||
fputs(replace, stdout);
|
||||
} else if (sp->fileno == 2) {
|
||||
if (sp->fldno < lb->nf)
|
||||
prfield(&lb->fields[sp->fldno]);
|
||||
else if (replace)
|
||||
fputs(replace, stdout);
|
||||
} else if (sp->fileno == 0) {
|
||||
prfield(joinfield);
|
||||
}
|
||||
|
||||
if (i < output.ns - 1)
|
||||
prsep();
|
||||
}
|
||||
} else {
|
||||
prfield(joinfield);
|
||||
prsep();
|
||||
|
||||
for (i = 0; i < la->nf; i++) {
|
||||
if (i != jfa) {
|
||||
prfield(&la->fields[i]);
|
||||
prsep();
|
||||
}
|
||||
}
|
||||
for (i = 0; i < lb->nf; i++) {
|
||||
if (i != jfb) {
|
||||
prfield(&lb->fields[i]);
|
||||
if (i < la->nf - 1)
|
||||
prsep();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
putchar('\n');
|
||||
}
|
||||
|
||||
static void
|
||||
prline(struct line *lp)
|
||||
{
|
||||
size_t len = strlen(lp->text);
|
||||
|
||||
if (fwrite(lp->text, 1, len, stdout) != len)
|
||||
eprintf("fwrite:");
|
||||
|
||||
putchar('\n');
|
||||
}
|
||||
|
||||
static int
|
||||
linecmp(struct line *la, struct line *lb, size_t jfa, size_t jfb)
|
||||
{
|
||||
int status;
|
||||
|
||||
/* return FIELD_ERROR if both lines are short */
|
||||
if (jfa >= la->nf) {
|
||||
status = jfb >= lb->nf ? FIELD_ERROR : -1;
|
||||
} else if (jfb >= lb->nf) {
|
||||
status = 1;
|
||||
} else {
|
||||
status = memcmp(la->fields[jfa].s, lb->fields[jfb].s,
|
||||
MAX (la->fields[jfa].len, lb->fields[jfb].len));
|
||||
if (status > 0)
|
||||
status = 1;
|
||||
else if (status < 0)
|
||||
status = -1;
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static void
|
||||
addfield(struct line *lp, char *sp, size_t len)
|
||||
{
|
||||
if (lp->nf >= lp->maxf) {
|
||||
lp->fields = ereallocarray(lp->fields, (GROW * lp->maxf),
|
||||
sizeof(struct field));
|
||||
lp->maxf *= GROW;
|
||||
}
|
||||
lp->fields[lp->nf].s = sp;
|
||||
lp->fields[lp->nf].len = len;
|
||||
lp->nf++;
|
||||
}
|
||||
|
||||
static void
|
||||
prspanjoin(struct span *spa, struct span *spb, size_t jfa, size_t jfb)
|
||||
{
|
||||
size_t i, j;
|
||||
|
||||
for (i = 0; i < (spa->nl - 1); i++)
|
||||
for (j = 0; j < (spb->nl - 1); j++)
|
||||
prjoin(spa->lines[i], spb->lines[j], jfa, jfb);
|
||||
}
|
||||
|
||||
static struct line *
|
||||
makeline(char *s, size_t len)
|
||||
{
|
||||
struct line *lp;
|
||||
char *sp, *beg, *end;
|
||||
size_t i;
|
||||
int eol = 0;
|
||||
|
||||
if (s[len-1] == '\n')
|
||||
s[len-1] = '\0';
|
||||
|
||||
lp = ereallocarray(NULL, INIT, sizeof(struct line));
|
||||
lp->text = s;
|
||||
lp->fields = ereallocarray(NULL, INIT, sizeof(struct field));
|
||||
lp->nf = 0;
|
||||
lp->maxf = INIT;
|
||||
|
||||
for (sp = lp->text; isblank(*sp); sp++)
|
||||
;
|
||||
|
||||
while (!eol) {
|
||||
beg = sp;
|
||||
|
||||
if (sep) {
|
||||
if (!(end = utfutf(sp, sep)))
|
||||
eol = 1;
|
||||
|
||||
if (!eol) {
|
||||
addfield(lp, beg, end - beg);
|
||||
for (i = 0; i < seplen; i++)
|
||||
end++;
|
||||
}
|
||||
} else {
|
||||
for (end = sp; !(isblank(*end)); end++) {
|
||||
if (*end == '\0') {
|
||||
eol = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!eol)
|
||||
addfield(lp, beg, end - beg);
|
||||
while (isblank(*++end))
|
||||
;
|
||||
}
|
||||
|
||||
if (eol)
|
||||
addfield(lp, beg, strlen(sp));
|
||||
|
||||
sp = end;
|
||||
}
|
||||
|
||||
return lp;
|
||||
}
|
||||
|
||||
static int
|
||||
addtospan(struct span *sp, FILE *fp, int reset)
|
||||
{
|
||||
char *newl = NULL;
|
||||
size_t len, size = 0;
|
||||
|
||||
if ((len = getline(&newl, &size, fp)) == -1) {
|
||||
if (ferror(fp))
|
||||
eprintf("getline:");
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (reset)
|
||||
sp->nl = 0;
|
||||
|
||||
if (sp->nl >= sp->maxl) {
|
||||
sp->lines = ereallocarray(sp->lines, (GROW * sp->maxl),
|
||||
sizeof(struct line *));
|
||||
sp->maxl *= GROW;
|
||||
}
|
||||
|
||||
sp->lines[sp->nl] = makeline(newl, len);
|
||||
sp->nl++;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void
|
||||
initspan(struct span *sp)
|
||||
{
|
||||
sp->nl = 0;
|
||||
sp->maxl = INIT;
|
||||
sp->lines = ereallocarray(NULL, INIT, sizeof(struct line *));;
|
||||
}
|
||||
|
||||
static void
|
||||
freespan(struct span *sp)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < sp->nl; i++) {
|
||||
free(sp->lines[i]->fields);
|
||||
free(sp->lines[i]->text);
|
||||
}
|
||||
|
||||
free(sp->lines);
|
||||
}
|
||||
|
||||
static void
|
||||
initolist(struct outlist *olp)
|
||||
{
|
||||
olp->ns = 0;
|
||||
olp->maxs = 1;
|
||||
olp->specs = ereallocarray(NULL, INIT, sizeof(struct spec *));
|
||||
}
|
||||
|
||||
static void
|
||||
addspec(struct outlist *olp, struct spec *sp)
|
||||
{
|
||||
if (olp->ns >= olp->maxs) {
|
||||
olp->specs = ereallocarray(olp->specs, (GROW * olp->maxs),
|
||||
sizeof(struct spec *));
|
||||
olp->maxs *= GROW;
|
||||
}
|
||||
olp->specs[olp->ns] = sp;
|
||||
olp->ns++;
|
||||
}
|
||||
|
||||
static struct spec *
|
||||
makespec(char *s)
|
||||
{
|
||||
struct spec *sp;
|
||||
int fileno;
|
||||
size_t fldno;
|
||||
|
||||
switch (s[0]) {
|
||||
case '0': /* join field */
|
||||
fileno = 0;
|
||||
fldno = 0;
|
||||
break;
|
||||
case '1': case '2':
|
||||
if (sscanf(s, "%d.%zu", &fileno, &fldno) != 2)
|
||||
eprintf("\"%s\": invalid format\n", s);
|
||||
fldno--; /* ugly */
|
||||
break;
|
||||
default:
|
||||
eprintf("%c: invalid file number (must be 0, 1 or 2)\n", s[0]);
|
||||
break;
|
||||
}
|
||||
|
||||
sp = ereallocarray(NULL, INIT, sizeof(struct spec));
|
||||
sp->fileno = fileno;
|
||||
sp->fldno = fldno;
|
||||
return sp;
|
||||
}
|
||||
|
||||
static void
|
||||
makeolist(struct outlist *olp, char *s)
|
||||
{
|
||||
char *item, *sp;
|
||||
sp = s;
|
||||
|
||||
while (sp) {
|
||||
item = sp;
|
||||
sp = strpbrk(sp, ", \t");
|
||||
if (sp)
|
||||
*sp++ = '\0';
|
||||
addspec(olp, makespec(item));
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
freespecs(struct outlist *olp)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < olp->ns; i++)
|
||||
free(olp->specs[i]);
|
||||
}
|
||||
|
||||
static void
|
||||
join(FILE *fa, FILE *fb, size_t jfa, size_t jfb)
|
||||
{
|
||||
struct span spa, spb;
|
||||
int cmp, eofa, eofb;
|
||||
|
||||
initspan(&spa);
|
||||
initspan(&spb);
|
||||
cmp = eofa = eofb = 0;
|
||||
|
||||
addtospan(&spa, fa, RESET);
|
||||
addtospan(&spb, fb, RESET);
|
||||
|
||||
while (spa.nl && spb.nl) {
|
||||
if ((cmp = linecmp(spa.lines[0], spb.lines[0], jfa, jfb)) < 0) {
|
||||
if (unpairsa)
|
||||
prline(spa.lines[0]);
|
||||
if (!addtospan(&spa, fa, RESET)) {
|
||||
if (unpairsb) { /* a is EOF'd; print the rest of b */
|
||||
do
|
||||
prline(spb.lines[0]);
|
||||
while (addtospan(&spb, fb, RESET));
|
||||
}
|
||||
eofa = eofb = 1;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
} else if (cmp > 0) {
|
||||
if (unpairsb)
|
||||
prline(spb.lines[0]);
|
||||
if (!addtospan(&spb, fb, RESET)) {
|
||||
if (unpairsa) { /* b is EOF'd; print the rest of a */
|
||||
do
|
||||
prline(spa.lines[0]);
|
||||
while (addtospan(&spa, fa, RESET));
|
||||
}
|
||||
eofa = eofb = 1;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
} else if (cmp == 0) {
|
||||
/* read all consecutive matching lines from a */
|
||||
do {
|
||||
if (!addtospan(&spa, fa, EXPAND)) {
|
||||
eofa = 1;
|
||||
spa.nl++;
|
||||
break;
|
||||
}
|
||||
} while (linecmp(spa.lines[spa.nl-1], spb.lines[0], jfa, jfb) == 0);
|
||||
|
||||
/* read all consecutive matching lines from b */
|
||||
do {
|
||||
if (!addtospan(&spb, fb, EXPAND)) {
|
||||
eofb = 1;
|
||||
spb.nl++;
|
||||
break;
|
||||
}
|
||||
} while (linecmp(spa.lines[0], spb.lines[spb.nl-1], jfa, jfb) == 0);
|
||||
|
||||
if (pairs)
|
||||
prspanjoin(&spa, &spb, jfa, jfb);
|
||||
|
||||
} else { /* FIELD_ERROR: both lines lacked join fields */
|
||||
if (unpairsa)
|
||||
prline(spa.lines[0]);
|
||||
if (unpairsb)
|
||||
prline(spb.lines[0]);
|
||||
eofa = addtospan(&spa, fa, RESET) ? 0 : 1;
|
||||
eofb = addtospan(&spb, fb, RESET) ? 0 : 1;
|
||||
if (!eofa && !eofb)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (eofa) {
|
||||
spa.nl = 0;
|
||||
} else {
|
||||
swaplines(spa.lines[0], spa.lines[spa.nl - 1]); /* ugly */
|
||||
spa.nl = 1;
|
||||
}
|
||||
|
||||
if (eofb) {
|
||||
spb.nl = 0;
|
||||
} else {
|
||||
swaplines(spb.lines[0], spb.lines[spb.nl - 1]); /* ugly */
|
||||
spb.nl = 1;
|
||||
}
|
||||
}
|
||||
freespan(&spa);
|
||||
freespan(&spb);
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
main(int argc, char *argv[])
|
||||
{
|
||||
size_t jf[2] = { jfield, jfield, };
|
||||
FILE *fp[2];
|
||||
int n;
|
||||
char *fno;
|
||||
|
||||
ARGBEGIN {
|
||||
case '1':
|
||||
jf[0] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX));
|
||||
break;
|
||||
case '2':
|
||||
jf[1] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX));
|
||||
break;
|
||||
case 'a':
|
||||
fno = EARGF(usage());
|
||||
if (strcmp(fno, "1") == 0)
|
||||
unpairsa = 1;
|
||||
else if (strcmp(fno, "2") == 0)
|
||||
unpairsb = 1;
|
||||
else
|
||||
usage();
|
||||
break;
|
||||
case 'e':
|
||||
replace = EARGF(usage());
|
||||
break;
|
||||
case 'o':
|
||||
oflag = 1;
|
||||
initolist(&output);
|
||||
makeolist(&output, EARGF(usage()));
|
||||
break;
|
||||
case 't':
|
||||
sep = EARGF(usage());
|
||||
break;
|
||||
case 'v':
|
||||
pairs = 0;
|
||||
fno = EARGF(usage());
|
||||
if (strcmp(fno, "1") == 0)
|
||||
unpairsa = 1;
|
||||
else if (strcmp(fno, "2") == 0)
|
||||
unpairsb = 1;
|
||||
else
|
||||
usage();
|
||||
break;
|
||||
default:
|
||||
usage();
|
||||
} ARGEND;
|
||||
|
||||
if (sep)
|
||||
seplen = unescape(sep);
|
||||
|
||||
if (argc != 2)
|
||||
usage();
|
||||
|
||||
for (n = 0; n < 2; n++) {
|
||||
if (argv[n][0] == '-' && !argv[n][1]) {
|
||||
argv[n] = "<stdin>";
|
||||
fp[n] = stdin;
|
||||
} else if (!(fp[n] = fopen(argv[n], "r"))) {
|
||||
eprintf("fopen %s:", argv[n]);
|
||||
}
|
||||
}
|
||||
|
||||
jf[0]--;
|
||||
jf[1]--;
|
||||
|
||||
join(fp[0], fp[1], jf[0], jf[1]);
|
||||
|
||||
if (oflag)
|
||||
freespecs(&output);
|
||||
|
||||
enfshut(2, fp[0], argv[0]);
|
||||
if (fp[0] != fp[1])
|
||||
enfshut(2, fp[1], argv[1]);
|
||||
enfshut(2, stdout, "<stdout>");
|
||||
exit(0);
|
||||
}
|
Loading…
Reference in New Issue
Block a user