/* tokenizer.h */

/** \file tokenizer.h 

    A specialized tokenizer for tokenizing the fish language. In the
    future, the tokenizer should be extended to support marks,
    tokenizing multiple strings and disposing of unused string
    segments.
*/

#ifndef FISH_TOKENIZER_H
#define FISH_TOKENIZER_H

#include <wchar.h>

/**
   The kinds of token that the tokenizer can report.
*/
enum token_type
{
	TOK_NONE = 0,         /**< Tokenizer not yet constructed */
	TOK_ERROR,            /**< Error reading token */
	TOK_INVALID,          /**< Invalid token */
	TOK_STRING,           /**< String token */
	TOK_PIPE,             /**< Pipe token */
	TOK_END,              /**< End token */
	TOK_REDIRECT_OUT,     /**< Redirection token */
	TOK_REDIRECT_APPEND,  /**< Redirection append token */
	TOK_REDIRECT_IN,      /**< Input redirection token */
	TOK_REDIRECT_FD,      /**< Redirection to new fd token */
	TOK_BACKGROUND,       /**< Send job to background token */
	TOK_COMMENT           /**< Comment token */
};

/**
   Tokenizer flag (for the flags argument of tok_init): accept
   incomplete parameters, i.e. parameters with mismatched
   parentheses, etc. This is useful for tab-completion.
*/
#define TOK_ACCEPT_UNFINISHED 0x1

/**
   Tokenizer flag (for the flags argument of tok_init): do not
   remove comments. Useful for syntax highlighting.
*/
#define TOK_SHOW_COMMENTS 0x2


/**
   State of a single tokenizer, as initialized by tok_init().
*/
typedef struct
{
	wchar_t *buff;          /**< Pointer into the original string, showing where the next token begins */
	wchar_t *orig_buff;     /**< A copy of the original string */
	wchar_t *last;          /**< Pointer to the last token */

	int last_type;          /**< Type of the last token */
	int last_len;           /**< Length of the last token */
	int last_pos;           /**< Offset of the last token */
	int has_next;           /**< Whether there are more tokens */
	int accept_unfinished;  /**< Whether incomplete tokens are accepted */
	int show_comments;      /**< Whether comments should be returned */

	/**
	   Flag set to true if orig_buff points to an internal string
	   that needs to be free()d when deallocating the tokenizer.
	*/
	int free_orig;

	wchar_t last_quote;     /**< Type of the last quote, can be either ' or " */
} tokenizer;

/**
  Initialize the tokenizer. b is the string that is to be
  tokenized. It is not copied, and should not be freed by the caller
  until after the tokenizer is destroyed.

  NOTE(review): the tokenizer struct documents orig_buff as "a copy of
  the original string" and carries a free_orig flag, which seems at
  odds with "not copied" above -- confirm against the implementation.

  \param tok The tokenizer to initialize
  \param b The string to tokenize
  \param flags Flags to the tokenizer. Setting TOK_ACCEPT_UNFINISHED will cause the tokenizer
  to accept incomplete tokens, such as a subshell without a closing
  parenthesis, as a valid token. Setting TOK_SHOW_COMMENTS will return comments as tokens.
*/
void tok_init( tokenizer *tok, const wchar_t *b, int flags );

/**
  Jump to the next token.

  \param tok The tokenizer to advance
*/
void tok_next( tokenizer *tok );

/**
  Returns the type of the last token.

  \param tok The tokenizer to query
  \return The type of the last token; must be one of the values in the token_type enum
*/
int tok_last_type( tokenizer *tok );

/**
  Returns the last token string.

  \param tok The tokenizer to query
  \return The last token string. It should not be freed by the caller.
*/
wchar_t *tok_last( tokenizer *tok );

/**
  Returns the type of quote from the last TOK_QSTRING.

  NOTE(review): TOK_QSTRING is not a member of the token_type enum
  declared in this header -- confirm which token type is meant.

  \param tok The tokenizer to query
  \return The quote character of the last quoted token
*/
wchar_t tok_last_quote( tokenizer *tok );

/**
  Returns true as long as there are more tokens left.

  \param tok The tokenizer to query
  \return Non-zero while there are more tokens left
*/
int tok_has_next( tokenizer *tok );

/**
  Returns the position of the beginning of the current token in the original string.

  \param tok The tokenizer to query
  \return The position of the beginning of the current token in the original string
*/
int tok_get_pos( tokenizer *tok );

/**
   Destroy the tokenizer and free associated memory.

   \param tok The tokenizer to destroy
*/
void tok_destroy( tokenizer *tok );


/**
   Returns the original string given to the tokenizer.

   \param tok The tokenizer to query
   \return The original string
 */
wchar_t *tok_string( tokenizer *tok );


/**
   Returns only the first token from the specified string. This is a
   convenience function, used to retrieve the first token of a
   string. This can be useful for error messages, etc.

   The returned string should be freed after use.

   \param str The string to take the first token from
   \return The first token of str
*/
wchar_t *tok_first( const wchar_t *str );

/**
   Move tokenizer position.

   \param tok The tokenizer to reposition
   \param pos The new position (presumably an offset into the original
   string, matching what tok_get_pos returns -- TODO confirm)
*/
void tok_set_pos( tokenizer *tok, int pos );

/**
   Returns a string description of the specified token type.

   \param type The token type to describe (presumably one of the
   token_type enum values -- TODO confirm)
   \return A string description of the token type
*/
const wchar_t *tok_get_desc( int type );

#endif