Discussion:
Wrapper for glob() that implements /**/ sub-pattern.
(too old to reply)
Kaz Kylheku
2023-09-12 03:49:16 UTC
Permalink
Hi all,

I'm experimenting with a wrapper function that drop-in replaces
for the POSIX glob, but gives it the /**/ superpower.

The /**/ pattern matches zero or more path components.

I have a prototype here which works like this.

If the /**/ sub-pattern does not occur in pattern, then
it just calls glob, passing it all its parameters.

If the /**/ sub-pattern occurs in the pattern, then
it iterates on it, successively replacing it with
/, /*/, /*/*/, /*/*/*/, ... and calling itself recursively.

After the first recursive call, it adds GLOB_APPEND
to the flags.

There are issues to do with termination (when do we stop?)
and performance.

In the prototype, I have the recursion generate a maximum of 48 /*/ star
wildcards across the entire path, and each /**/ pattern can individually
expand to no more than 10.

Multiple occurrences of /**/ drag down the performance of the prototype
badly. Up to three is what I would call practical.

The real function should handle patterns starting with "**/" and also
ending in "/**", as well as when "**" is the entire pattern.

Plus there are issues of sorting. We might want to collect results with
GLOB_NOSORT and sort the paths ourselves.

I'm already thinking forward to a different algorithm, but
here is the prototype.

#include <glob.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

static int super_glob_rec(const char *pattern, int flags,
int (*errfunc) (const char *epath, int eerrno),
glob_t *pglob, size_t star_limit)
{
const char *dblstar = strstr(pattern, "/**/");

if (dblstar == 0) {
return glob(pattern, flags, errfunc, pglob);
} else {
size_t i, base_len = strlen(pattern);
size_t ds_off = dblstar - pattern + 1;
size_t tail_off = ds_off + 3;
size_t limit = star_limit > 10 ? 10 : star_limit;

for (i = 0; i < limit; i++) {
size_t space = base_len - 3 + i * 2;
char *pat_copy = malloc(space + 1);
size_t j;
char *out = pat_copy + ds_off;
int res;

strncpy(pat_copy, pattern, ds_off);

for (j = 0; j < i; j++) {
*out++ = '*';
*out++ = '/';
}

strcpy(out, pattern + tail_off);

if (i > 0)
flags |= GLOB_APPEND;

res = super_glob_rec(pat_copy, flags, errfunc, pglob, star_limit - i);

free(pat_copy);

if (res && res != GLOB_NOMATCH)
return res;
}

return 0;
}
}

static int super_glob(const char *pattern, int flags,
int (*errfunc) (const char *epath, int eerrno),
glob_t *pglob)
{
return super_glob_rec(pattern, flags, errfunc, pglob, 48);
}


int main(int argc, char **argv)
{
int status = EXIT_FAILURE;

if (argc == 2) {
glob_t glb;
int res = super_glob(argv[1], 0, NULL, &glb);
if (res && res != GLOB_NOMATCH) {
fprintf(stderr, "%s: glob failed with %d\n", argv[0], res);
} else {
for (size_t i = 0; i < glb.gl_pathc; i++)
puts(glb.gl_pathv[i]);
}
globfree(&glb);
} else if (argc == 1) {
fprintf(stderr, "%s: specify one glob pattern argument\n", argv[0]);
}

return status;
}
--
TXR Programming Language: http://nongnu.org/txr
Cygnal: Cygwin Native Application Library: http://kylheku.com/cygnal
Mastodon: @***@mstdn.ca
NOTE: If you use Google Groups, I don't see you, unless you're whitelisted.
Kaz Kylheku
2023-09-12 06:47:05 UTC
Permalink
Post by Kaz Kylheku
The real function should handle patterns starting with "**/" and also
ending in "/**", as well as when "**" is the entire pattern.
I fixed this in the prototype.

#include <glob.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

static int super_glob_rec(const char *pattern, int flags,
int (*errfunc) (const char *epath, int eerrno),
glob_t *pglob, size_t star_limit)
{
const char *dblstar = 0;

if (strncmp(pattern, "**/", 3) == 0 || strcmp(pattern, "**") == 0) {
dblstar = pattern;
} else if ((dblstar = strstr(pattern, "/**/")) != 0) {
dblstar++;
} else if (strlen(pattern) >= 3) {
const char *end = pattern + strlen(pattern);
if (strcmp(end - 3, "/**") == 0)
dblstar = end - 2;
}

if (dblstar == 0) {
return glob(pattern, flags, errfunc, pglob);
} else {
size_t i, base_len = strlen(pattern);
size_t ds_off = dblstar - pattern;
size_t tail_off = ds_off + 2;
size_t limit = star_limit > 10 ? 10 : star_limit;

for (i = 0; i < limit; i++) {
size_t space = base_len - 3 + i * 2;
char *pat_copy = malloc(space + 2);
size_t j;
char *out = pat_copy + ds_off;
int res;

strncpy(pat_copy, pattern, ds_off);

for (j = 0; j < i; j++) {
*out++ = '*';
if (j < i - 1)
*out++ = '/';
}

strcpy(out, pattern + tail_off);

if (i > 0)
flags |= GLOB_APPEND;

res = super_glob_rec(pat_copy, flags, errfunc, pglob, star_limit - i);

free(pat_copy);

if (res && res != GLOB_NOMATCH)
return res;
}

return 0;
}
}

static int super_glob(const char *pattern, int flags,
int (*errfunc) (const char *epath, int eerrno),
glob_t *pglob)
{
return super_glob_rec(pattern, flags, errfunc, pglob, 48);
}


int main(int argc, char **argv)
{
int status = EXIT_FAILURE;

if (argc == 2) {
glob_t glb;
int res = super_glob(argv[1], 0, NULL, &glb);
if (res && res != GLOB_NOMATCH) {
fprintf(stderr, "%s: glob failed with %d\n", argv[0], res);
} else {
for (size_t i = 0; i < glb.gl_pathc; i++)
puts(glb.gl_pathv[i]);
}
globfree(&glb);
} else if (argc == 1) {
fprintf(stderr, "%s: specify one glob pattern argument\n", argv[0]);
}

return status;
}
Kaz Kylheku
2023-09-12 17:17:17 UTC
Permalink
Post by Kaz Kylheku
Post by Kaz Kylheku
The real function should handle patterns starting with "**/" and also
ending in "/**", as well as when "**" is the entire pattern.
I fixed this in the prototype.
Issues:

1. Sorting with brace expansions:

Brace expansion, is supported in glibc's glob via GLOB_BRACE.

The order of the brace substitutions has to be preserved
through the globbing.

Given a glob pattern "alpha{beta,gamma}omega", it has to be
expanded into "alphabetaomega" and "alphagammaomega" which
are separately matched, sorted, and then combined together in
that order.

It has to be that way, otherwise it will wreck the semantics of
command lines generated with brace expansion.

E.g. *.{foo,bar} has to behave like *.foo *.bar where all the
.foo files list before .bar files.

That means if you want to implement some new glob semantics on top
of glob, you have to intercept GLOB_BRACE and do it yourself;
you can't just apply a sorting pass to the total expansion.

2. Escaping

The interior /**/ pattern could occur in a class like [abc/**/def]
in which case it must not be recognized.
William Ahern
2023-09-14 01:56:11 UTC
Permalink
Post by Kaz Kylheku
Post by Kaz Kylheku
Post by Kaz Kylheku
The real function should handle patterns starting with "**/" and also
ending in "/**", as well as when "**" is the entire pattern.
I fixed this in the prototype.
<snip>
Post by Kaz Kylheku
2. Escaping
The interior /**/ pattern could occur in a class like [abc/**/def]
in which case it must not be recognized.
FWIW, OpenBSD sh seems not to tolerate slashes in bracket expressions:

$ ls *
bar foo foo*bar
$ ls *[*]*
foo*bar
$ ls *[*z]*
foo*bar
$ ls *[*/]*
ls: *[*/]*: No such file or directory

It seems to be [recursively] splitting on slash before pattern matching on
filenames. See line 1086 in globit at
https://cvsweb.openbsd.org/cgi-bin/cvsweb/src/bin/ksh/eval.c?annotate=1.67

And this behavior seems to be POSIX compliant:

2.13.3 Patterns Used for Filename Expansion

The rules described so far in Patterns Matching a Single Character and
Patterns Matching Multiple Characters are qualified by the following rules
that apply when pattern matching notation is used for filename expansion:

1. The <slash> character in a pathname shall be explicitly matched by
using one or more <slash> characters in the pattern; it shall neither
be matched by the <asterisk> or <question-mark> special characters nor
by a bracket expression. <slash> characters in the pattern shall be
identified before bracket expressions; thus, a <slash> cannot be
included in a pattern bracket expression used for filename expansion.

At first I was wondering why you thought you could get away with merely
scanning for slash+double-star and double-star+slash--bracket expressions
obviously require stateful parsing. But clearly POSIX anticipates (or even
expects) that shells would process slashes before parsing component
patterns. I believe this allowance/limitation would likewise apply to
glob(3), which specially mentions rule #3 under 2.13.3; unlike case and
fnmatch(3), which are normally used for generic string matching.

But I guess none of that is helpful if you're trying to match some
sophisticated Bash behavior.
William Ahern
2023-09-14 02:33:16 UTC
Permalink
Post by William Ahern
Post by Kaz Kylheku
Post by Kaz Kylheku
Post by Kaz Kylheku
The real function should handle patterns starting with "**/" and also
ending in "/**", as well as when "**" is the entire pattern.
I fixed this in the prototype.
<snip>
Post by Kaz Kylheku
2. Escaping
The interior /**/ pattern could occur in a class like [abc/**/def]
in which case it must not be recognized.
<snip>
Post by William Ahern
At first I was wondering why you thought you could get away with merely
scanning for slash+double-star and double-star+slash--bracket expressions
obviously require stateful parsing. But clearly POSIX anticipates (or even
expects) that shells would process slashes before parsing component
patterns. I believe this allowance/limitation would likewise apply to
glob(3), which specially mentions rule #3 under 2.13.3; unlike case and
fnmatch(3), which are normally used for generic string matching.
I'm glad I dug into this as I may have missed this caveat several years ago
when experimenting with a recursive glob command in POSIX shell:

https://25thandClement.com/~william/2023/glob.sh

The recursion is implemented by recursively appending '*/' to the directory
prefix, rather than with an in-pattern construct, either because I forgot
about the '/**/' extension or assumed it wasn't possible without more
cumbersome, stateful parsing of the pattern string. Though most of the
implementation is preoccupped with how to safely communicate filenames with
special characters--particularly whitespace--through pipelines using only
fast shell commands like printf (as opposed to od).

I wish I had pushed this up to GitHub as I rely on the (sh) printf %b format
specifier for decoding encoded filenames. POSIX sh printf %b clashes with
the newly defined %b in C2x. There's a discussion on the Open Group
mailing-list about whether POSIX 202x/SuSv5 should deprecate %b, and someone
did a query across GitHub code only to find that almost all the usages of %b
are the same copy-pasted code which could be easily replaced if POSIX
deprecated/removed the old %b specifier.
Nuno Silva
2023-09-14 08:59:32 UTC
Permalink
On 2023-09-14, William Ahern wrote:

[...]
Post by William Ahern
I wish I had pushed this up to GitHub as I rely on the (sh) printf %b format
specifier for decoding encoded filenames. POSIX sh printf %b clashes with
the newly defined %b in C2x. There's a discussion on the Open Group
mailing-list about whether POSIX 202x/SuSv5 should deprecate %b, and someone
did a query across GitHub code only to find that almost all the usages of %b
are the same copy-pasted code which could be easily replaced if POSIX
deprecated/removed the old %b specifier.
How complete would such a query be? I mean, are there other
"centralized" web repositories of code with a significant user base?

(There is also code at other locations grouping several projects, FSF's
savannah comes to mind, but I don't think it has a centralized code
search, does it? Same goes for SourceForge?)
--
Nuno Silva
William Ahern
2023-09-16 04:10:46 UTC
Permalink
Post by Nuno Silva
[...]
Post by William Ahern
I wish I had pushed this up to GitHub as I rely on the (sh) printf %b format
specifier for decoding encoded filenames. POSIX sh printf %b clashes with
the newly defined %b in C2x. There's a discussion on the Open Group
mailing-list about whether POSIX 202x/SuSv5 should deprecate %b, and someone
did a query across GitHub code only to find that almost all the usages of %b
are the same copy-pasted code which could be easily replaced if POSIX
deprecated/removed the old %b specifier.
How complete would such a query be? I mean, are there other
"centralized" web repositories of code with a significant user base?
(There is also code at other locations grouping several projects, FSF's
savannah comes to mind, but I don't think it has a centralized code
search, does it? Same goes for SourceForge?)
I don't know how much credence anyone on the mailing-list gives to that
GitHub search, nor do I know where the balance of opinion stands on
deprecating %b. And unfortunately most of the discussion occured on a
conference call (which I wasn't privy to) and the mailing-list. Though it
seems to have spilled over to an Enhancement Request regarding %q,
https://www.austingroupbugs.net/view.php?id=1771, and the GNU coreutils
mailing-list, https://lists.gnu.org/r/bug-coreutils/2023-09/msg00015.html

I don't believe the mailing-list archive is public, but subscribing is open.
You first need to create an account on opengroup.org. The mailing-list is
austin-group-***@opengroup.org, but the portal website is byzantine and I
can't even find the page showing my subscription to the mailing-list.
Adam Sampson
2023-09-16 13:13:39 UTC
Permalink
Post by Nuno Silva
How complete would such a query be? I mean, are there other
"centralized" web repositories of code with a significant user base?
I find codesearch.debian.net very useful for this kind of thing -- it
allows regex searches across all of Debian's source packages. This
includes both widely-used and obscure software, although it only covers
packages in the current development version of Debian so older code
won't be included.
--
Adam Sampson <***@offog.org> <http://offog.org/>
Kaz Kylheku
2023-09-14 03:34:12 UTC
Permalink
Post by William Ahern
Post by Kaz Kylheku
Post by Kaz Kylheku
Post by Kaz Kylheku
The real function should handle patterns starting with "**/" and also
ending in "/**", as well as when "**" is the entire pattern.
I fixed this in the prototype.
<snip>
Post by Kaz Kylheku
2. Escaping
The interior /**/ pattern could occur in a class like [abc/**/def]
in which case it must not be recognized.
Yes; and it doesn't make sense. Or not in glob, anyway.

Nevertheless, a star glob preprocessor above glob should heed the class
syntax and not treat /**/; just pass that through to glob and let it
fail.

Matching slashes with class syntax makes sense in situations when
we are not matching paths. Or matching paths more freely.
obvously it's allowed in a POSIX shell case statement, and in fnmatch()
(in the absence of FNM_PATHNAME) and and so on.
Post by William Ahern
At first I was wondering why you thought you could get away with merely
scanning for slash+double-star and double-star+slash--bracket expressions
obviously require stateful parsing.
I was initially after the behavior: proof-of-concept. When things have
driven around the block, then we tighten the lugnuts.

In the current version I have it look for ** being the whole thing,
starting with **/, ending with /** or containing /**/ where the
leading / is not escaped or in a character class.

I think I may havae a bug: for detecting the trailing /**, we should
avoid interpreting it if it follows a backslash; let glob deal
with it.
Post by William Ahern
But I guess none of that is helpful if you're trying to match some
sophisticated Bash behavior.
I used this to cob together a function called glob* that is now
integrated in TXR Lisp.

The current glob function is a wrapper for glob written in C,
which now calls superglob if the extension flag GLOB_XSTAR is present.

glob accepts a list of patterns, not just a single pattern; the results
from multiple patterns are catenated.

glob* is written in Lisp, and performs its own brace expansion to
generate a list of patterns passed to glob with GLOB_XSTAR.

(I had once submitted a TXR Lisp brace expansion solution to Rosetta
Code; I integrated that ready-made solution.)

I've tested on Linux, MacOS and Cygwin so far; it's looking good.
Will check Solaris 10.
--
TXR Programming Language: http://nongnu.org/txr
Cygnal: Cygwin Native Application Library: http://kylheku.com/cygnal
Mastodon: @***@mstdn.ca
NOTE: If you use Google Groups, I don't see you, unless you're whitelisted.
Kaz Kylheku
2023-09-14 06:17:37 UTC
Permalink
Post by Kaz Kylheku
Post by William Ahern
Post by Kaz Kylheku
Post by Kaz Kylheku
Post by Kaz Kylheku
The real function should handle patterns starting with "**/" and also
ending in "/**", as well as when "**" is the entire pattern.
I fixed this in the prototype.
<snip>
Post by Kaz Kylheku
2. Escaping
The interior /**/ pattern could occur in a class like [abc/**/def]
in which case it must not be recognized.
Yes; and it doesn't make sense. Or not in glob, anyway.
Nevertheless, a star glob preprocessor above glob should heed the class
syntax and not treat /**/; just pass that through to glob and let it
fail.
Matching slashes with class syntax makes sense in situations when
we are not matching paths. Or matching paths more freely.
obvously it's allowed in a POSIX shell case statement, and in fnmatch()
(in the absence of FNM_PATHNAME) and and so on.
Post by William Ahern
At first I was wondering why you thought you could get away with merely
scanning for slash+double-star and double-star+slash--bracket expressions
obviously require stateful parsing.
I was initially after the behavior: proof-of-concept. When things have
driven around the block, then we tighten the lugnuts.
In the current version I have it look for ** being the whole thing,
starting with **/, ending with /** or containing /**/ where the
leading / is not escaped or in a character class.
I think I may havae a bug: for detecting the trailing /**, we should
avoid interpreting it if it follows a backslash; let glob deal
with it.
Post by William Ahern
But I guess none of that is helpful if you're trying to match some
sophisticated Bash behavior.
I used this to cob together a function called glob* that is now
integrated in TXR Lisp.
The current glob function is a wrapper for glob written in C,
which now calls superglob if the extension flag GLOB_XSTAR is present.
glob accepts a list of patterns, not just a single pattern; the results
from multiple patterns are catenated.
glob* is written in Lisp, and performs its own brace expansion to
generate a list of patterns passed to glob with GLOB_XSTAR.
I should mention I fixed the brace expansion sorting issue;
and the general sorting issue.

The individual super_globs calls beneath the glob wrapper do the
sorting: GLOB_NOSORT is passed to glob and then the array is sorted
using qsort.

The brace expansion is processed in glob* which appends the individual
results together in order, so there is no global sort messing things up.

For qsort, I am using the comparison function below.

In contrast, Bash's internal glob function does stupid things,
like use strcoll or strcmp under different circumstnaces.
strcoll falls victim to locale; strcmp is silly for paths.

The following function just sorts on byte without regard for
character set. However, it collates the / character before any other.

So for instance these two entries are in sorted order:

test/
test-dir/

whereas under strcmp(), test-dir would come first because -
is before / in ASCII.

convert is a casting macro; just pretend convert(T, E) is ((T) (E)).

static int glob_path_cmp(const void *ls, const void *rs)
{
const unsigned char *lstr = *convert(const unsigned char * const *, ls);
const unsigned char *rstr = *convert(const unsigned char * const *, rs);

for (; *lstr && *rstr; lstr++, rstr++)
{
if (*lstr == *rstr)
continue;
if (*lstr == '/')
return -1;
if (*rstr == '/')
return 1;
if (*lstr < *rstr)
return -1;
if (*lstr > *rstr)
return 1;
}

if (!*lstr)
return -1;
if (!*rstr)
return 1;

return 0;
}
--
TXR Programming Language: http://nongnu.org/txr
Cygnal: Cygwin Native Application Library: http://kylheku.com/cygnal
Mastodon: @***@mstdn.ca
NOTE: If you use Google Groups, I don't see you, unless you're whitelisted.
Loading...