1 /*
2 shlex, simple shell-like lexical analysis library
3 Copyright (C) 2019  Victor Porton
4 
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
9 
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14 
15 You should have received a copy of the GNU General Public License
16 along with this program.  If not, see <https://www.gnu.org/licenses/>.
17 
18 This code was a rewrite of a Python 3.7 module with the same name:
19 Copyright © 2001-2019 Python Software Foundation; All Rights Reserved
20 */
21 
22 module shlex;
23 
24 import std.typecons;
25 import std.conv;
26 import std.string;
27 import std.utf;
28 import std.regex;
29 import std.array;
30 import std.range.interfaces;
31 import std.range.primitives;
32 import std.container;
33 import std.container.dlist;
34 import std.algorithm;
35 import std.file;
36 import std.path;
37 import std.stdio : write, writeln;
38 
39 // TODO: use moveFront()/moveBack()
40 
/// The character stream type consumed by the lexer: an input range object
/// yielding `const dchar` code points.
alias ShlexStream = InputRange!(const(dchar));
42 
/**
 * An `InputRange!dchar` over the text of a file.
 *
 * The whole file is loaded into memory by the constructor; the range
 * primitives then walk over that in-memory text.
 */
class ShlexFile : InputRange!dchar {
    private string text;

    /// Loads the complete contents of the file `name` (via `readText`).
    this(string name) {
        text = readText(name);
    }

    /// The current code point of the remaining text.
    override @property dchar front() {
        return text.front;
    }

    /// Same as `std.range.primitives.moveFront` applied to the remaining text.
    override dchar moveFront() {
        return text.moveFront();
    }

    /// Drops the current code point from the remaining text.
    override void popFront() {
        text.popFront();
    }

    /// `true` once the text is exhausted.
    override @property bool empty() {
        return text.empty;
    }

    /// `foreach` support. Iterates over a copy of the remaining slice, so
    /// this object itself is not advanced.
    override int opApply(scope int delegate(dchar) dg) {
        auto rest = text;
        while (!rest.empty) {
            immutable rc = dg(rest.front);
            if (rc)
                return rc;
            rest.popFront();
        }
        return 0;
    }

    /// Indexed `foreach` support; the index counts code points starting at 0.
    /// Iterates over a copy of the remaining slice.
    override int opApply(scope int delegate(size_t, dchar) dg) {
        auto rest = text;
        size_t index = 0;
        while (!rest.empty) {
            immutable rc = dg(index, rest.front);
            if (rc)
                return rc;
            ++index;
            rest.popFront();
        }
        return 0;
    }

    /// Nothing to release: the file was fully read by the constructor.
    void close() { }
}
90 
/// Consumes characters from `stream` up to and including the next '\n',
/// or up to the end of the stream when no newline is left.
private void skipLine(ShlexStream stream) {
    while (!stream.empty) {
        immutable isNewline = stream.front == '\n';
        stream.popFront();
        if (isNewline)
            break;
    }
}
95 
/**
 * A lexical analyzer class for simple shell-like syntaxes.
 *
 * The lexer pulls characters from a `ShlexStream` and groups them into
 * tokens with a small state machine (`readToken`). `getToken` adds a
 * push-back stack and a stack of nested input sources on top of it.
 *
 * End of input is reported as the `eof` value: a null `Nullable` in posix
 * mode and an empty string otherwise.
 */
struct Shlex {
    alias Posix = Flag!"posix";
    alias PunctuationChars = Flag!"PunctuationChars";
    alias Comments = Flag!"comments";

private:
    // TODO: Python shlex has some of the following as public instance variables (also check visibility of member functions)
    ShlexStream instream;
    Nullable!string infile;
    Posix posix;
    Nullable!string eof; // seems not efficient
    //bool delegate(string token) isEof;
    // Allocated per instance in the constructor. A `new` field initializer
    // would be evaluated once and the same tree shared (and mutated) by
    // every lexer.
    RedBlackTree!(immutable dchar) commenters;
    RedBlackTree!(immutable dchar) wordchars;
    static immutable whitespace = new RedBlackTree!(immutable dchar)(" \t\r\n");
    bool whitespaceSplit = false;
    static immutable quotes = new RedBlackTree!(immutable dchar)("'\"");
    static immutable escape = new RedBlackTree!(immutable dchar)("\\"); // char or string?
    static immutable escapedquotes = new RedBlackTree!(immutable dchar)("\""); // char or string?
    // Current lexer state: ' ' = between tokens, 'a' = inside a word,
    // 'c' = inside a punctuation run, a quote or escape character while
    // inside quotes / after an escape, null = end of input reached.
    Nullable!dchar state = ' '; // a little inefficient?
    auto pushback = DList!string(); // may be not the fastest
    uint lineno;
    ubyte debug_ = 0;
    string token = "";
    auto filestack = DList!(Tuple!(Nullable!string, ShlexStream, uint))(); // may be not the fastest
    Nullable!string source; // TODO: Represent no source just as an empty string?
    // Allocated per instance in the constructor (see `commenters`).
    RedBlackTree!(immutable dchar) punctuationChars;
    // _pushbackChars is a push back queue used by lookahead logic
    auto _pushbackChars = DList!dchar(); // may be not the fastest

    /// Converts an arbitrary character source into a `ShlexStream`.
    /// Non-string character ranges are copied code point by code point;
    /// everything else goes through `dtext`, as the constructor always did.
    static ShlexStream toShlexStream(Stream)(Stream src) {
        import std.traits : isSomeChar, isSomeString;
        static if (!isSomeString!Stream && isInputRange!Stream && isSomeChar!(ElementType!Stream)) {
            const(dchar)[] buf;
            foreach (dchar c; src)
                buf ~= c;
            return cast(ShlexStream) inputRangeObject(buf);
        } else {
            // TODO: Inefficient to convert to dstring in memory.
            return cast(ShlexStream) inputRangeObject(cast(const(dchar)[]) src.dtext);
        }
    }

    /// `true` when `tok` equals the end-of-input marker `eof`
    /// (both null, or both non-null with equal text).
    bool isEofToken(Nullable!string tok) {
        if (eof.isNull || tok.isNull)
            return eof.isNull && tok.isNull;
        return eof.get == tok.get;
    }

public:
    @disable this();

    /**
     * Creates a lexer reading from `instream`.
     *
     * We don't support implicit stdin as `instream` as in Python.
     *
     * Params:
     *   instream = character stream to tokenize
     *   infile = name of the input, used by `errorLeader` and `sourcehook`
     *   posix = posix mode (quotes are stripped, escapes are honoured, EOF is null)
     *   punctuationCharsFlag = treat runs of `();<>|&` as separate tokens
     */
    this(ShlexStream instream,
         Nullable!string infile = Nullable!string.init,
         Posix posix = No.posix,
         PunctuationChars punctuationCharsFlag = No.PunctuationChars)
    {
        this.instream = instream;
        this.infile = infile;
        this.posix = posix;
        if (!posix) eof = "";
        commenters = new RedBlackTree!(immutable dchar)("#");
        punctuationChars = new RedBlackTree!(immutable dchar)();
        wordchars = new RedBlackTree!(immutable dchar)("abcdefghijklmnopqrstuvwxyz" ~ "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_");
        if (posix)
            wordchars.stableInsert("ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ" ~ "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ");
        lineno = 1;
        if (punctuationCharsFlag) {
            this.punctuationChars.stableInsert("();<>|&");
            // these chars added because allowed in file names, args, wildcards
            wordchars.stableInsert("~-./*?=");
            // remove any punctuation chars from wordchars
            // TODO: Isn't it better to use dstring?
            wordchars = new RedBlackTree!(immutable dchar)(filter!(c => c !in punctuationChars)(wordchars.array));
        }
    }

    /// Convenience constructor for sources that are not already a
    /// `ShlexStream` (for example a `string`); the source is converted in memory.
    this(Stream)(Stream instream,
                 Nullable!string infile = Nullable!string.init,
                 Posix posix = No.posix,
                 PunctuationChars punctuationChars = No.PunctuationChars)
    if (!is(Stream : ShlexStream))
    {
        this(toShlexStream(instream), infile, posix, punctuationChars);
    }

    /// Prints the lexer state when the debug level is at least 3.
    void dump() {
        if (debug_ >= 3) {
//            writeln("state='", state, "\' nextchar='", nextchar, "\' token='", token, '\'');
            writeln("state='", state, "\' token='", token, '\'');
        }
    }

    /** Push a token onto the stack popped by the getToken method */
    void pushToken(string tok) {
        if (debug_ >= 1)
            writeln("shlex: pushing token " ~ tok);
        pushback.insertFront(tok);
    }

    /** Push an input source onto the lexer's input source stack.
        This overload accepts any character source and converts it first. */
    void pushSource(Stream)(Stream newstream, Nullable!string newfile = Nullable!string.init)
    if (!is(Stream : ShlexStream))
    {
        // Convert the NEW stream; the previous code wrapped the current
        // `instream` here and so re-pushed the stream being read.
        pushSource(toShlexStream(newstream), newfile);
    }

    /** Push an input source onto the lexer's input source stack. */
    void pushSource(ShlexStream newstream, Nullable!string newfile = Nullable!string.init) {
        filestack.insertFront(tuple(this.infile, this.instream, this.lineno));
        this.infile = newfile;
        this.instream = newstream;
        this.lineno = 1;
        if (debug_) {
            // NOTE(review): formatting a range object with %s may iterate it — confirm
            // before enabling debug output.
            if (newfile.isNull)
                writeln("shlex: pushing to stream %s".format(this.instream));
            else
                writeln("shlex: pushing to file %s".format(this.infile));
        }
    }

    /** Pop the input source stack. */
    void popSource() {
        // The current stream is not necessarily a ShlexFile; the cast yields
        // null in that case, so only close when it really is one.
        if (auto file = cast(ShlexFile) instream)
            file.close();
        // use a tuple library?
        auto t = filestack.front;
        filestack.removeFront();
        infile   = t[0];
        instream = t[1];
        lineno   = t[2];
        if (debug_)
            writeln("shlex: popping to %s, line %d".format(instream, lineno));
        state = ' ';
    }

    // TODO: Use empty string for None?
    /** Get a token from the input stream (or from stack if it's nonempty).
        Returns the `eof` value at end of input: null in posix mode,
        an empty string otherwise.
        Throws: Exception if the `source` keyword is not followed by a file name. */
    Nullable!string getToken() {
        if (!pushback.empty) {
            string tok = pushback.front;
            pushback.removeFront();
            if (debug_ >= 1)
                writeln("shlex: popping token " ~ tok);
            return Nullable!string(tok);
        }
        // No pushback.  Get a token.
        Nullable!string raw = readToken();
        // Handle inclusions
        if (!source.isNull && !source.get.empty) {
            while (!raw.isNull && raw.get == source.get) {
                auto nameToken = readToken();
                if (nameToken.isNull)
                    throw new Exception("No file name after source token");
                auto spec = sourcehook(nameToken.get);
                pushSource(spec[1], nullable(spec[0]));
                raw = getToken();
            }
        }
        // Maybe we got EOF instead?
        while (isEofToken(raw)) {
            if (filestack.empty)
                return eof;
            popSource();
            raw = getToken();
        }
        // Neither inclusion nor EOF
        if (debug_ >= 1) {
            if (!raw.isNull)
                writeln("shlex: token=" ~ raw.get);
            else
                writeln("shlex: token=EOF");
        }
        return raw;
    }

    /// `foreach` support: yields tokens until the end-of-input marker.
    int opApply(scope int delegate(ref string) dg) {
        int result = 0;
        while (true) {
            auto r = getToken();
            // Stop on `eof`, not only on null: outside posix mode the
            // end-of-input marker is an empty string.
            if (r.isNull || isEofToken(r)) break;
            string tok = r.get;
            result = dg(tok);
            if (result) break;
        }
        return result;
    }

    // TODO: Use empty string for None?
    /** Read one raw token from the current input stream, without consulting
        the token push-back stack or the source stack.
        Returns null in posix mode when no (quoted) token was read; otherwise
        the token text, which is empty at end of input.
        Throws: Exception on an unterminated quotation or a trailing escape character. */
    Nullable!string readToken() {
        bool quoted = false;
        dchar escapedstate = ' '; // TODO: use an enum
        while (true) {
            if (debug_ >= 3) {
                write("Iteration ");
                dump();
            }
            Nullable!dchar nextchar;
            if (!punctuationChars.empty && !_pushbackChars.empty) {
                // Queue discipline (first in, first out), like the Python original.
                nextchar = _pushbackChars.front;
                _pushbackChars.removeFront();
            } else if (!instream.empty) {
                nextchar = instream.front;
                instream.popFront();
            }
            if (!nextchar.isNull && nextchar.get == '\n')
                ++lineno;
            if (debug_ >= 3)
                writeln("shlex: in state %s I see character: %s".format(state, nextchar));
            if (state.isNull) {
                // Reached when readToken is called again after end of input.
                token = "";        // past end of file
                break;
            } else if (state.get == ' ') {
                if (nextchar.isNull) {
                    state.nullify();  // end of file
                    break;
                }
                immutable c = nextchar.get;
                if (c in whitespace) {
                    if (debug_ >= 2)
                        writeln("shlex: I see whitespace in whitespace state");
                    if (!token.empty || (posix && quoted))
                        break;   // emit current token
                    else
                        continue;
                } else if (c in commenters) {
                    instream.skipLine();
                    ++lineno;
                } else if (posix && c in escape) {
                    escapedstate = 'a';
                    state = c;
                } else if (c in wordchars) {
                    token = [c].toUTF8;
                    state = 'a';
                } else if (c in punctuationChars) {
                    token = [c].toUTF8;
                    state = 'c';
                } else if (c in quotes) {
                    if (!posix) token = [c].toUTF8;
                    state = c;
                } else if (whitespaceSplit) {
                    token = [c].toUTF8;
                    state = 'a';
                } else {
                    token = [c].toUTF8;
                    if (!token.empty || (posix && quoted))
                        break;   // emit current token
                    else
                        continue;
                }
            } else if (state.get in quotes) {
                quoted = true;
                if (nextchar.isNull) {      // end of file
                    if (debug_ >= 2)
                        writeln("shlex: I see EOF in quotes state");
                    // XXX what error should be raised here?
                    throw new Exception("No closing quotation");
                }
                immutable c = nextchar.get;
                if (c == state.get) {
                    if (!posix) {
                        token ~= c;
                        state = ' ';
                        break;
                    } else
                        state = 'a';
                } else if (posix && c in escape && state.get in escapedquotes) {
                    escapedstate = state.get;
                    state = c;
                } else
                    token ~= c;
            } else if (state.get in escape) {
                if (nextchar.isNull) {      // end of file
                    if (debug_ >= 2)
                        writeln("shlex: I see EOF in escape state");
                    // XXX what error should be raised here?
                    throw new Exception("No escaped character");
                }
                immutable c = nextchar.get;
                // In posix shells, only the quote itself or the escape
                // character may be escaped within quotes.
                if (escapedstate in quotes && c != state.get && c != escapedstate)
                    token ~= state.get;
                token ~= c;
                state = escapedstate;
            } else if (state.get == 'a' || state.get == 'c') {
                if (nextchar.isNull) {
                    state.nullify();   // end of file
                    break;
                }
                immutable c = nextchar.get;
                if (c in whitespace) {
                    if (debug_ >= 2)
                        writeln("shlex: I see whitespace in word state");
                    state = ' ';
                    if (!token.empty || (posix && quoted))
                        break;   // emit current token
                    else
                        continue;
                } else if (c in commenters) {
                    instream.skipLine();
                    ++lineno;
                    if (posix) {
                        state = ' ';
                        if (!token.empty || (posix && quoted))
                            break;   // emit current token
                        else
                            continue;
                    }
                } else if (state.get == 'c') {
                    if (c in punctuationChars)
                        token ~= c;
                    else {
                        // `!x in set` would parse as `(!x) in set`; use `!in`
                        // so the character ending the punctuation run is kept.
                        if (c !in whitespace)
                            _pushbackChars.insertBack(c);
                        state = ' ';
                        break;
                    }
                } else if (posix && c in quotes)
                    state = c;
                else if (posix && c in escape) {
                    escapedstate = 'a';
                    state = c;
                } else if (c in wordchars || c in quotes || whitespaceSplit) {
                    token ~= c;
                } else {
                    if (punctuationChars.empty)
                        pushback.insertFront(c.to!string);
                    else
                        _pushbackChars.insertBack(c);
                    if (debug_ >= 2)
                        writeln("shlex: I see punctuation in word state");
                    state = ' ';
                    if (!token.empty || (posix && quoted))
                        break;   // emit current token
                    else
                        continue;
                }
            }
        }
        Nullable!string result = token;
        //writeln('['~token~']');
        token = "";
        if (posix && !quoted && result.get == "")
            result.nullify();
        if (debug_ > 1) {
            if (!result.isNull && !result.get.empty) // TODO: can simplify?
                writeln("shlex: raw token=" ~ result.get);
            else
                writeln("shlex: raw token=EOF");
        }
        return result;
    }

    /** Hook called on a filename to be sourced.
        Strips one pair of surrounding double quotes, resolves a relative
        name against the directory of the current `infile` (when there is
        one) and opens the file.
        Returns: a tuple of the resolved file name and a `ShlexFile` reading it. */
    auto sourcehook(string newfile) {
        if (newfile.length >= 2 && newfile[0] == '"')
            newfile = newfile[1..$-1];
        // This implements cpp-like semantics for relative-path inclusion.
        if (!infile.isNull && !isAbsolute(newfile))
            newfile = buildPath(dirName(infile.get), newfile);
        return tuple(newfile, new ShlexFile(newfile));
    }

    /** Emit a C-compiler-like, Emacs-friendly error-message leader.
        A null `infile` or `lineno` falls back to the lexer's current value. */
    string errorLeader(Nullable!string infile = Nullable!string.init,
                        Nullable!uint lineno=Nullable!uint.init)
    {
        if (infile.isNull)
            infile = this.infile;
        if (lineno.isNull)
            lineno = this.lineno;
        return "\"%s\", line %d: ".format(infile, lineno.get);
    }
}
452 
/**
 * Split the string `s` into tokens using shell-like syntax.
 *
 * Params:
 *   s = the text to split
 *   comments = when `No.comments` (the default), no character starts a comment
 *   posix = lexer mode; posix mode (the default) strips quotes and honours escapes
 *
 * Returns: the tokens in order; an empty array for empty input.
 */
string[] split(string s, Shlex.Comments comments = No.comments, Shlex.Posix posix = Yes.posix) {
    auto lex = Shlex(s, Nullable!string.init, posix); // TODO: shorten
    lex.whitespaceSplit = true;
    if (!comments) {
        // Give this lexer its own empty set instead of clearing the existing
        // one, which may be shared with other lexers.
        lex.commenters = new RedBlackTree!(immutable dchar)();
    }
    string[] tokens;
    while (true) {
        auto tok = lex.getToken();
        if (tok.isNull)
            break;
        // Outside posix mode end of input is reported as `eof` (an empty
        // string), not as null, so compare against it explicitly.
        if (!lex.eof.isNull && tok.get == lex.eof.get)
            break;
        tokens ~= tok.get;
    }
    return tokens;
}
460 
// Posix-mode splitting: plain words, repeated whitespace and quoting.
// (No process-wide resource limits are set here: they would affect every
// other test in the same process and are only available on POSIX systems.)
unittest {
    assert(split("") == []);
    assert(split("l") == ["l"]);
    assert(split("ls") == ["ls"]);
    assert(split("a  b") == ["a", "b"]);
    assert(split("a \"b c\"") == ["a", "b c"]);
    assert(split("ls -l 'somefile; ls -xz ~'") == ["ls", "-l", "somefile; ls -xz ~"]);
    assert(split("ssh home 'somefile; ls -xz ~'") == ["ssh", "home", "somefile; ls -xz ~"]);
}
472 
// Matches any character outside the shell-safe set: ASCII letters, digits,
// underscore and @%+=:,./-  (underscore included, as in Python's `\w`).
private immutable _findUnsafe = regex(r"[^a-zA-Z0-9_@%+=:,./-]");

/**
 * Return a shell-escaped version of the string *s*.
 *
 * An empty string becomes `''`. A string made only of safe characters is
 * returned unchanged. Anything else is wrapped in single quotes, with each
 * embedded single quote written as `'"'"'`.
 */
string quote(string s) {
    if (s.empty)
        return "''";
    if (!matchFirst(s, _findUnsafe))
        return s;

    // use single quotes, and put single quotes into double quotes
    // the string $'b is then quoted as '$'"'"'b'
    return '\'' ~ s.replace("'", "'\"'\"'") ~ '\'';
}
486 
// quote(): empty input, safe input, input needing quotes, embedded single quote.
unittest {
    assert(quote("") == "''");
    assert(quote("ls") == "ls");
    assert(quote("somefile; ls -xz ~") == "'somefile; ls -xz ~'");
    // Assert the result; printing the comparison checks nothing.
    assert(quote("'") == "''\"'\"''"); // TODO: Too long result (as inherited from the Python library)
}
492 
/// Debugging helper: writes every token of `lexer` on its own line, stopping
/// at the first null or empty token.
void _printTokens(Shlex lexer) {
    for (;;) {
        Nullable!string tt = lexer.getToken();
        immutable finished = tt.isNull || tt.get.empty; // TODO: can simplify?
        if (finished)
            return;
        writeln("Token: " ~ tt.get);
    }
}
500