1 /*
2 shlex, simple shell-like lexical analysis library
3 Copyright (C) 2019  Victor Porton
4 
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9 
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14 
15 You should have received a copy of the GNU General Public License
16 along with this program.  If not, see <https://www.gnu.org/licenses/>.
17 
18 This code was a rewrite of a Python 3.7 module with the same name:
19 Copyright © 2001-2019 Python Software Foundation; All Rights Reserved
20 */
21 
22 module shlex;
23 
24 import std.typecons;
25 import std.conv;
26 import std.string;
27 import std.utf;
28 import std.regex;
29 import std.array;
30 import std.range.interfaces;
31 import std.range.primitives;
32 import std.container;
33 import std.container.dlist;
34 import std.algorithm;
35 import std.file;
36 import std.path;
37 import std.stdio : write, writeln;
38 import pure_dependency.providers;
39 import struct_params;
40 
41 // TODO: use moveFront()/moveBack()
42 
/// Type-erased input range of (const) Unicode code points; this is the
/// stream type `Shlex` stores as its current input and on its source stack.
alias ShlexStream = InputRange!(const dchar); // Unicode stream
44 
/**
Input range of code points over the contents of a text file.

The whole file is loaded into memory by the constructor; the range
primitives then walk (and consume) that in-memory text.
*/
class ShlexFile : InputRange!dchar {
    private string text;

    /// Reads the complete file `name` into memory (see `std.file.readText`).
    this(string name) {
        text = readText(name);
    }

    /// First code point of the text that has not been consumed yet.
    override @property dchar front() {
        return text.front;
    }

    /// Same value as `front`, obtained through the `moveFront` primitive.
    override dchar moveFront() {
        return text.moveFront();
    }

    /// Drops the first remaining code point.
    override void popFront() {
        text.popFront();
    }

    /// `true` once every code point has been consumed.
    override @property bool empty() {
        return text.empty;
    }

    /// `foreach` support: visits the remaining code points without consuming
    /// them (the iteration runs on a copy of the slice).
    override int opApply(scope int delegate(dchar) dg) {
        auto rest = text;
        while (!rest.empty) {
            if (auto rc = dg(rest.front))
                return rc; // the loop body asked to stop
            rest.popFront();
        }
        return 0;
    }

    /// `foreach` support with a running index of the visited code points;
    /// like the overload above it leaves the range itself untouched.
    override int opApply(scope int delegate(size_t, dchar) dg) {
        size_t index = 0;
        for (auto rest = text; !rest.empty; rest.popFront(), ++index) {
            if (auto rc = dg(index, rest.front))
                return rc; // the loop body asked to stop
        }
        return 0;
    }

    /// No-op: the file was fully read (and released) in the constructor.
    void close() { }
}
92 
/// Consumes `stream` up to and including the next '\n'
/// (or up to the end of the stream when no newline is left).
private void skipLine(ShlexStream stream) {
    for (; !stream.empty; stream.popFront()) {
        if (stream.front == '\n') {
            stream.popFront(); // swallow the line terminator itself
            return;
        }
    }
}
97 
98 /// A lexical analyzer class for simple shell-like syntaxes
/**
A lexical analyzer for simple shell-like syntaxes.

Tokens are pulled with `getToken` (or by iterating the lexer with `foreach`).
In POSIX mode the end of input is reported as a null token; in non-POSIX
mode it is reported as the empty string.
*/
struct Shlex {
    alias Posix = Flag!"posix";
    alias PunctuationChars = Flag!"PunctuationChars";
    alias Comments = Flag!"comments";

private:
    alias CharSet = RedBlackTree!(immutable dchar);

    ShlexStream instream;
    Nullable!string infile;
    Posix posix;
    Nullable!string eof; // seems not efficient
    //bool delegate(string token) isEof;
    // The mutable character sets are created in the constructor: a class
    // instance created in a struct field initializer would be one single
    // object shared (and mutated) by every Shlex value.
    CharSet commenters;
    CharSet wordchars;
    static immutable whitespace = new RedBlackTree!(immutable dchar)(" \t\r\n");
    bool whitespaceSplit = false;
    static immutable quotes = new RedBlackTree!(immutable dchar)("'\"");
    static immutable escape = new RedBlackTree!(immutable dchar)("\\"); // char or string?
    static immutable escapedquotes = new RedBlackTree!(immutable dchar)("\""); // char or string?
    // Lexer state: ' ' = between tokens, 'a' = inside a word, 'c' = inside a
    // run of punctuation, a quote/escape character = inside that construct,
    // null = past the end of input.
    Nullable!dchar state = ' '; // a little inefficient?
    auto pushback = DList!string(); // may be not the fastest
    uint lineno;
    ubyte debug_ = 0;
    string token = "";
    auto filestack = DList!(Tuple!(Nullable!string, ShlexStream, uint))(); // may be not the fastest
    Nullable!string source; // TODO: Represent no source just as an empty string?
    CharSet punctuationChars;
    // _pushbackChars is a push back queue used by lookahead logic
    auto _pushbackChars = DList!dchar(); // may be not the fastest

    /// Identity on the value; only changes the static type to `const(dchar)`
    /// so that a mapped range has the element type of `ShlexStream`.
    static const(dchar) asConstDchar(dchar c) @safe pure nothrow @nogc {
        return c;
    }

    /// Converts an arbitrary character source into a `ShlexStream`.
    static ShlexStream adaptStream(Stream)(Stream stream) {
        import std.traits : isSomeString, Unqual;

        static if (is(Stream : ShlexStream)) {
            return stream;
        } else static if (!isSomeString!Stream && isInputRange!Stream
                          && is(Unqual!(ElementType!Stream) == dchar)) {
            // A range of code points (e.g. ShlexFile): wrap it lazily.
            // Converting such an object with dtext would not yield its characters.
            return cast(ShlexStream) inputRangeObject(stream.map!asConstDchar);
        } else {
            // TODO: Inefficient to convert to dstring in memory.
            return cast(ShlexStream) inputRangeObject(cast(const dchar[]) stream.dtext);
        }
    }

public:
    @disable this();

    /**
    Creates a lexer reading from `instream`.

    We don't support implicit stdin as `instream` as in Python.

    Params:
        instream = source of characters
        infile = name of the source, used for error messages and for
                 resolving relative paths of included files
        posix = selects POSIX parsing rules (quote removal, escapes, null EOF)
        punctuationCharsFlag = when set, runs of the characters `();<>|&`
                               are returned as separate tokens
        whitespaceSplit = when set, tokens are split on whitespace only
    */
    this(ShlexStream instream,
         Nullable!string infile = Nullable!string.init,
         Posix posix = No.posix,
         PunctuationChars punctuationCharsFlag = No.PunctuationChars,
         bool whitespaceSplit = false)
    {
        this.instream = instream;
        this.infile = infile;
        this.posix = posix;
        this.whitespaceSplit = whitespaceSplit;
        if (!posix) eof = "";
        commenters = new CharSet("#");
        punctuationChars = new CharSet();
        wordchars = new CharSet("abcdfeghijklmnopqrstuvwxyz" ~ "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_");
        if (posix)
            wordchars.stableInsert("ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ" ~ "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ");
        lineno = 1;
        if (punctuationCharsFlag) {
            punctuationChars.stableInsert("();<>|&");
            // these chars added because allowed in file names, args, wildcards
            wordchars.stableInsert("~-./*?=");
            // remove any punctuation chars from wordchars
            // TODO: Isn't it better to use dstring?
            wordchars = new CharSet(filter!(c => c !in punctuationChars)(wordchars.array));
        }
    }

    /// Ditto, for any character source convertible to a stream of code
    /// points (a string, or an input range of `dchar`).
    this(Stream)(Stream instream,
                 Nullable!string infile = Nullable!string.init,
                 Posix posix = No.posix,
                 PunctuationChars punctuationChars = No.PunctuationChars,
                 bool whitespaceSplit = false)
    {
        this(adaptStream(instream), infile, posix, punctuationChars, whitespaceSplit);
    }

    /// Prints the lexer state when the debug level is at least 3.
    void dump() {
        if (debug_ >= 3) {
//            writeln("state='", state, "\' nextchar='", nextchar, "\' token='", token, '\'');
            writeln("state='", state, "\' token='", token, '\'');
        }
    }

    /** Push a token onto the stack popped by the getToken method */
    void pushToken(string tok) {
        if (debug_ >= 1)
            writeln("shlex: pushing token " ~ tok);
        pushback.insertFront(tok);
    }

    /** Push an input source onto the lexer's input source stack. */
    void pushSource(Stream)(Stream newstream, Nullable!string newfile = Nullable!string.init) {
        // Must wrap the *new* stream (not the current `instream`).
        pushSource(adaptStream(newstream), newfile);
    }

    /** Push an input source onto the lexer's input source stack. */
    void pushSource(ShlexStream newstream, Nullable!string newfile = Nullable!string.init) {
        filestack.insertFront(tuple(this.infile, this.instream, this.lineno));
        this.infile = newfile;
        this.instream = newstream;
        this.lineno = 1;
        if (debug_) {
            if (newfile.isNull)
                writeln("shlex: pushing to stream %s".format(this.instream));
            else
                writeln("shlex: pushing to file %s".format(this.infile));
        }
    }

    /**
    Pop the input source stack.

    Throws: `Exception` if there is no saved source to return to.
    */
    void popSource() {
        if (filestack.empty)
            throw new Exception("shlex: no input source to pop");
        // Only file-backed streams have something to close; the cast yields
        // null for every other kind of stream.
        if (auto file = cast(ShlexFile) instream)
            file.close();
        // use a tuple library?
        auto t = filestack.front;
        filestack.removeFront();
        infile   = t[0];
        instream = t[1];
        lineno   = t[2];
        if (debug_)
            writeln("shlex: popping to %s, line %d".format(instream, lineno));
        state = ' ';
    }

    // TODO: Use empty string for None?
    /** Get a token from the input stream (or from stack if it's nonempty).
        Returns null value on eof (the empty string in non-POSIX mode). */
    Nullable!string getToken() {
        if (!pushback.empty) {
            immutable tok = pushback.front;
            pushback.removeFront();
            if (debug_ >= 1)
                writeln("shlex: popping token " ~ tok);
            return nullable(tok);
        }
        // No pushback.  Get a token.
        Nullable!string raw = readToken();
        // Handle inclusions
        if (!source.isNull && !source.get.empty) {
            while (raw == source) {
                auto fileTok = readToken();
                if (fileTok.isNull)
                    throw new Exception("No file name after source token");
                auto spec = sourcehook(fileTok.get);
                pushSource(spec[1], nullable(spec[0]));
                raw = getToken();
            }
        }
        // Maybe we got EOF instead?
        while (eof == raw) {
            if (filestack.empty)
                return eof;
            else {
                popSource();
                raw = getToken();
            }
        }
        // Neither inclusion nor EOF
        if (debug_ >= 1) {
            if (eof != raw)
                writeln("shlex: token=" ~ raw.get);
            else
                writeln("shlex: token=EOF");
        }
        return raw;
    }

    /// `foreach` support: yields tokens until the end-of-input marker
    /// (null in POSIX mode, the empty string otherwise) is read.
    int opApply(scope int delegate(ref string) dg) {
        int result = 0;
        while (true) {
            auto r = getToken();
            // Comparing with `eof` is required: in non-POSIX mode the end of
            // input is "" and never null, so a null test alone loops forever.
            if (r.isNull || r == eof) break;
            string tok = r.get;
            result = dg(tok);
            if (result) break;
        }
        return result;
    }

    // TODO: Use empty string for None?
    /**
    Reads one raw token from the current input stream, ignoring the token
    pushback stack and the source stack.

    Throws: `Exception` on an unterminated quotation or a dangling escape.
    */
    Nullable!string readToken() {
        bool quoted = false;
        dchar escapedstate = ' '; // TODO: use an enum
        while (true) {
            if (debug_ >= 3) {
                write("Iteration ");
                dump();
            }
            // Next character: pushed-back lookahead first, then the stream.
            // Stays null at the end of input.
            Nullable!dchar nextchar;
            if (!punctuationChars.empty && !_pushbackChars.empty) {
                nextchar = _pushbackChars.back;
                _pushbackChars.removeBack();
            } else if (!instream.empty) {
                nextchar = instream.front;
                instream.popFront();
            }
            if (nextchar == '\n')
                ++lineno;
            if (debug_ >= 3)
                writeln("shlex: in state %s I see character: %s".format(state, nextchar));
            if (state.isNull) {
                // TODO: Debugger shows that this is never reached. Is this code needed?
                token = "";        // past end of file
                break;
            } else if (state == ' ') {
                if (nextchar.isNull) {
                    state.nullify();  // end of file
                    break;
                } else if (nextchar.get in whitespace) {
                    if (debug_ >= 2)
                        writeln("shlex: I see whitespace in whitespace state");
                    if (!token.empty || (posix && quoted))
                        break;   // emit current token
                    else
                        continue;
                } else if (nextchar.get in commenters) {
                    instream.skipLine();
                    ++lineno;
                } else if (posix && nextchar.get in escape) {
                    escapedstate = 'a';
                    state = nextchar;
                } else if (nextchar.get in wordchars) {
                    token = [nextchar.get].toUTF8;
                    state = 'a';
                } else if (nextchar.get in punctuationChars) {
                    token = [nextchar.get].toUTF8;
                    state = 'c';
                } else if (nextchar.get in quotes) {
                    if (!posix) token = [nextchar.get].toUTF8;
                    state = nextchar;
                } else if (whitespaceSplit) {
                    token = [nextchar.get].toUTF8;
                    state = 'a';
                } else {
                    token = [nextchar.get].toUTF8;
                    if (!token.empty || (posix && quoted))
                        break;   // emit current token
                    else
                        continue;
                }
            } else if (state.get in quotes) {
                quoted = true;
                if (nextchar.isNull) {      // end of file
                    if (debug_ >= 2)
                        writeln("shlex: I see EOF in quotes state");
                    // XXX what error should be raised here?
                    throw new Exception("No closing quotation");
                }
                if (nextchar.get == state.get) {
                    if (!posix) {
                        token ~= nextchar.get;
                        state = ' ';
                        break;
                    } else
                        state = 'a';
                } else if (posix && nextchar.get in escape && state.get in escapedquotes) {
                    escapedstate = state.get;
                    state = nextchar;
                } else
                    token ~= nextchar.get;
            } else if (state.get in escape) {
                if (nextchar.isNull) {      // end of file
                    if (debug_ >= 2)
                        writeln("shlex: I see EOF in escape state");
                    // XXX what error should be raised here?
                    throw new Exception("No escaped character");
                }
                // In posix shells, only the quote itself or the escape
                // character may be escaped within quotes.
                if (escapedstate in quotes && nextchar.get != state.get && nextchar.get != escapedstate)
                    token ~= state.get;
                token ~= nextchar.get;
                state = escapedstate;
            } else if (state.get == 'a' || state.get == 'c') {
                if (nextchar.isNull) {
                    state.nullify();   // end of file
                    break;
                } else if (nextchar.get in whitespace) {
                    if (debug_ >= 2)
                        writeln("shlex: I see whitespace in word state");
                    state = ' ';
                    if (!token.empty || (posix && quoted))
                        break;   // emit current token
                    else
                        continue;
                } else if (nextchar.get in commenters) {
                    instream.skipLine();
                    ++lineno;
                    if (posix) {
                        state = ' ';
                        if (!token.empty || (posix && quoted))
                            break;   // emit current token
                        else
                            continue;
                    }
                } else if (state == 'c') {
                    if (nextchar.get in punctuationChars)
                        token ~= nextchar.get;
                    else {
                        // `!in` is required here: `!x in set` would negate x
                        // first and the lookahead character would be lost.
                        if (nextchar.get !in whitespace)
                            _pushbackChars.insertBack(nextchar.get);
                        state = ' ';
                        break;
                    }
                } else if (posix && nextchar.get in quotes)
                    state = nextchar;
                else if (posix && nextchar.get in escape) {
                    escapedstate = 'a';
                    state = nextchar;
                } else if (nextchar.get in wordchars || nextchar.get in quotes || whitespaceSplit) {
                    token ~= nextchar.get;
                } else {
                    if (punctuationChars.empty)
                        pushback.insertFront(nextchar.get.to!string);
                    else
                        _pushbackChars.insertBack(nextchar.get);
                    if (debug_ >= 2)
                        writeln("shlex: I see punctuation in word state");
                    state = ' ';
                    if (!token.empty || (posix && quoted))
                        break;   // emit current token
                    else
                        continue;
                }
            }
        }
        Nullable!string result = token;
        //writeln('['~token~']');
        token = "";
        // In POSIX mode an unquoted empty token means end of input.
        if (posix && !quoted && result == "")
            result.nullify();
        if (debug_ > 1) {
            if (!result.isNull && !result.get.empty) // TODO: can simplify?
                writeln("shlex: raw token=" ~ result.get);
            else
                writeln("shlex: raw token=EOF");
        }
        return result;
    }

    /**
    Hook called on a filename to be sourced.

    Strips surrounding double quotes from `newfile`, resolves a relative name
    against the directory of the current input file (if one is known) and
    opens the file.

    Returns: a tuple of the resolved file name and the opened `ShlexFile`.
    */
    auto sourcehook(string newfile) {
        if (newfile.length >= 2 && newfile[0] == '"')
            newfile = newfile[1..$-1];
        // This implements cpp-like semantics for relative-path inclusion.
        // Without a known current file name there is nothing to resolve against.
        if (!infile.isNull && !isAbsolute(newfile))
            newfile = buildPath(dirName(infile.get), newfile);
        return tuple(newfile, new ShlexFile(newfile));
    }

    /**
    Emit a C-compiler-like, Emacs-friendly error-message leader.

    Params:
        infile = file name to report; defaults to the current input file
        lineno = line number to report; defaults to the current line
    */
    string errorLeader(Nullable!string infile = Nullable!string.init,
                        Nullable!uint lineno=Nullable!uint.init)
    {
        if (infile.isNull)
            infile = this.infile;
        if (lineno.isNull)
            lineno = this.lineno;
        return "\"%s\", line %d: ".format(infile, lineno.get);
    }
}
456 
// Parameter bundle for constructing a `Shlex` from a `ShlexStream`: one field
// per argument of the `Shlex` constructor, in the same order.
// NOTE(review): `StructParams` comes from the `struct_params` package; judging
// by the use below it is expected to declare `ShlexParams` with a nested
// `WithDefaults` type - confirm against that package.
mixin StructParams!("ShlexParams",
                    ShlexStream, "instream",
                    Nullable!string, "infile",
                    Shlex.Posix, "posix",
                    Shlex.PunctuationChars, "punctuationCharsFlag",
                    bool, "whitespaceSplit");
// Defaults for every parameter except `instream`; the values are the same as
// the default arguments of the `Shlex` constructor.
private ShlexParams.WithDefaults shlexDefaults = { infile: Nullable!string.init,
                                                   posix: No.posix,
                                                   punctuationCharsFlag: No.PunctuationChars,
                                                   whitespaceSplit: false };
// Provider (from `pure_dependency.providers`) whose callable forwards its
// arguments to the `Shlex` constructor. `new` on a struct yields a pointer,
// so the callable returns `Shlex*`.
alias ShlexProvider = ProviderWithDefaults!(Callable!(
    (ShlexStream instream, Nullable!string infile,
     Shlex.Posix posix,
     Shlex.PunctuationChars punctuationCharsFlag,
     bool whitespaceSplit) => new Shlex(instream, infile, posix, punctuationCharsFlag, whitespaceSplit)),
    ShlexParams, shlexDefaults);
473 
/// Same parameter bundle, defaults and provider as the module-level
/// `ShlexParams` / `ShlexProvider`, but with `instream` typed as an arbitrary
/// `Stream` type instead of `ShlexStream`.
template ShlexProviderStream(Stream) {
    // One field per argument of the `Shlex` constructor, in the same order.
    mixin StructParams!("ShlexParams",
                        Stream, "instream",
                        Nullable!string, "infile",
                        Shlex.Posix, "posix",
                        Shlex.PunctuationChars, "punctuationCharsFlag",
                        bool, "whitespaceSplit");
    // Defaults for every parameter except `instream`.
    private ShlexParams.WithDefaults shlexDefaults = { infile: Nullable!string.init,
                                                       posix: No.posix,
                                                       punctuationCharsFlag: No.PunctuationChars,
                                                       whitespaceSplit: false };
    // The callable forwards to the `Shlex` constructor and, because `new` on
    // a struct yields a pointer, returns `Shlex*`.
    alias ShlexProvider = ProviderWithDefaults!(Callable!(
        (Stream instream, Nullable!string infile,
         Shlex.Posix posix,
         Shlex.PunctuationChars punctuationCharsFlag,
        bool whitespaceSplit) => new Shlex(instream, infile, posix, punctuationCharsFlag, whitespaceSplit)),
    ShlexParams, shlexDefaults);
}
492 
493 // TODO: Use dependency injection.
/**
Splits the string `s` into tokens using shell-like syntax.

Params:
    s = the text to split
    comments = when `No.comments` (the default), '#' does not start a comment
    posix = parsing mode handed to the lexer (POSIX by default)

Returns: the tokens of `s`, in order.
*/
string[] split(string s, Shlex.Comments comments = No.comments, Shlex.Posix posix = Yes.posix) {
    auto lex = Shlex(s, Nullable!string.init, posix); // TODO: shorten
    lex.whitespaceSplit = true;
    if (!comments) {
        // Give this lexer its own empty set instead of clearing the existing
        // tree in place: that tree may be shared with other Shlex values.
        lex.commenters = new RedBlackTree!(immutable dchar)();
    }
    string[] tokens;
    // Stop at the lexer's end-of-input marker: null in POSIX mode, the empty
    // string otherwise (a plain null test would never end in non-POSIX mode).
    for (auto tok = lex.getToken(); !tok.isNull && tok != lex.eof; tok = lex.getToken())
        tokens ~= tok.get;
    return tokens;
}
501 
// Checks `split` on an empty string, single words and quoted arguments.
unittest {
    // The resource-limit API only exists on POSIX systems; guarding it keeps
    // the test buildable everywhere else.
    version (Posix) {
        import core.sys.posix.sys.resource;
        auto limit = rlimit(100*1000000, 100*1000000);
        setrlimit(RLIMIT_AS, &limit); // prevent OS crash due out of memory
    }

    assert(split("") == []);
    assert(split("l") == ["l"]);
    assert(split("ls") == ["ls"]);
    assert(split("ls -l 'somefile; ls -xz ~'") == ["ls", "-l", "somefile; ls -xz ~"]);
    assert(split("ssh home 'somefile; ls -xz ~'") == ["ssh", "home", "somefile; ls -xz ~"]);
}
513 
// Matches any character that is NOT in the safe set: ASCII letters, digits,
// underscore and @%+=:,./- (the ASCII `\w@%+=:,./-` set of the Python original).
private immutable _findUnsafe = regex(r"[^a-zA-Z0-9_@%+=:,./-]");

/**
Return a shell-escaped version of the string *s*.

The empty string becomes `''`; a string made only of safe characters is
returned unchanged; anything else is wrapped in single quotes, with every
embedded single quote written as `'"'"'`.
*/
string quote(string s) {
    if (s.empty)
        return "''";
    if (!matchFirst(s, _findUnsafe))
        return s; // nothing that the shell would interpret

    // use single quotes, and put single quotes into double quotes
    // the string $'b is then quoted as '$'"'"'b'
    return '\'' ~ s.replace("'", "'\"'\"'") ~ '\'';
}
527 
// Checks `quote` on the empty string, on text with shell metacharacters and
// on a lone single quote.
unittest {
    assert(quote("") == "''");
    assert(quote("somefile; ls -xz ~") == "'somefile; ls -xz ~'");
    // This comparison used to be printed instead of asserted, so it could never fail.
    assert(quote("'") == "''\"'\"''"); // TODO: Too long result (as inherited from the Python library)
}
533 
/// Writes every token produced by `lexer` on its own "Token: " line and
/// stops at the first null or empty token.
void _printTokens(Shlex lexer) {
    for (;;) {
        auto current = lexer.getToken();
        if (current.isNull)
            return;
        if (current.get.empty) // TODO: can simplify?
            return;
        writeln("Token: " ~ current.get);
    }
}
541