mirror of
https://github.com/ganelson/inform.git
synced 2024-05-22 02:48:41 +03:00
Use CH32EOF as the EOF marker, which allows fewer casts. Also switch the generated C code back to using wchar_t.
This commit is contained in:
parent
bde03297ac
commit
81a92aef9f
|
@ -166,14 +166,15 @@ alone, and the version number is returned.
|
|||
by the local |\n| for good measure.
|
||||
|
||||
@<Read the titling line of the extension and normalise its casing@> =
|
||||
int c, commented_out = FALSE, quoted = FALSE, content_found = FALSE;
|
||||
while ((c = TextFiles::utf8_fgetc(EXTF, NULL, NULL)) != EOF) {
|
||||
inchar32_t c;
|
||||
int commented_out = FALSE, quoted = FALSE, content_found = FALSE;
|
||||
while ((c = TextFiles::utf8_fgetc(EXTF, NULL, NULL)) != CH32EOF) {
|
||||
if (c == 0xFEFF) continue; /* skip the optional Unicode BOM pseudo-character */
|
||||
if (commented_out) {
|
||||
if (c == ']') commented_out = FALSE;
|
||||
} else if (quoted) {
|
||||
if (c == '"') quoted = FALSE;
|
||||
PUT_TO(titling_line, (inchar32_t) c);
|
||||
PUT_TO(titling_line, c);
|
||||
} else {
|
||||
if (c == '[') commented_out = TRUE;
|
||||
else {
|
||||
|
@ -181,10 +182,10 @@ by the local |\n| for good measure.
|
|||
else if ((c == '\x0a') || (c == '\x0d') || (c == '\n')) {
|
||||
if (content_found) break;
|
||||
c = ' ';
|
||||
} else if (Characters::is_whitespace((inchar32_t) c) == FALSE) {
|
||||
} else if (Characters::is_whitespace(c) == FALSE) {
|
||||
content_found = TRUE;
|
||||
}
|
||||
PUT_TO(titling_line, (inchar32_t) c);
|
||||
PUT_TO(titling_line, c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -205,15 +206,16 @@ halfway through a line division combination like |0A 0D|, so that the first
|
|||
thing we read here is a meaningless |0D|.
|
||||
|
||||
@<Read the rubric text, if any is present@> =
|
||||
int c, found_start = FALSE;
|
||||
while ((c = TextFiles::utf8_fgetc(EXTF, NULL, NULL)) != EOF) {
|
||||
inchar32_t c;
|
||||
int found_start = FALSE;
|
||||
while ((c = TextFiles::utf8_fgetc(EXTF, NULL, NULL)) != CH32EOF) {
|
||||
if ((c == '\x0a') || (c == '\x0d') || (c == '\n') || (c == '\t')) c = ' ';
|
||||
if ((c != ' ') && (found_start == FALSE)) {
|
||||
if (c == '"') found_start = TRUE;
|
||||
else break;
|
||||
} else {
|
||||
if (c == '"') break;
|
||||
if (found_start) PUT_TO(E->rubric_as_lexed, (inchar32_t) c);
|
||||
if (found_start) PUT_TO(E->rubric_as_lexed, c);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1193,34 +1193,35 @@ text |bibliographic_sentence| and |in French| to the text |bracketed|. If not,
|
|||
the whole thing goes into |bibliographic_sentence| and |bracketed| is empty.
|
||||
|
||||
@<Capture the opening sentence and its bracketed part@> =
|
||||
int c, commented = FALSE, quoted = FALSE, rounded = FALSE, content_found = FALSE;
|
||||
while ((c = TextFiles::utf8_fgetc(SF, NULL, NULL)) != EOF) {
|
||||
inchar32_t c;
|
||||
int commented = FALSE, quoted = FALSE, rounded = FALSE, content_found = FALSE;
|
||||
while ((c = TextFiles::utf8_fgetc(SF, NULL, NULL)) != CH32EOF) {
|
||||
if (c == 0xFEFF) continue; /* skip the optional Unicode BOM pseudo-character */
|
||||
if (commented) {
|
||||
if (c == ']') commented = FALSE;
|
||||
} else {
|
||||
if (quoted) {
|
||||
if (rounded) PUT_TO(bracketed, (inchar32_t) c);
|
||||
else PUT_TO(bibliographic_sentence, (inchar32_t) c);
|
||||
if (rounded) PUT_TO(bracketed, c);
|
||||
else PUT_TO(bibliographic_sentence, c);
|
||||
if (c == '"') quoted = FALSE;
|
||||
} else {
|
||||
if (c == '[') commented = TRUE;
|
||||
else {
|
||||
if (Characters::is_whitespace((inchar32_t) c) == FALSE) content_found = TRUE;
|
||||
if (Characters::is_whitespace(c) == FALSE) content_found = TRUE;
|
||||
if (rounded) {
|
||||
if (c == '"') quoted = TRUE;
|
||||
if ((c == '\x0a') || (c == '\x0d') || (c == '\n')) c = ' ';
|
||||
if (c == ')') rounded = FALSE;
|
||||
else PUT_TO(bracketed, (inchar32_t) c);
|
||||
else PUT_TO(bracketed, c);
|
||||
} else {
|
||||
if (c == '(') rounded = TRUE;
|
||||
else {
|
||||
if ((c == '\x0a') || (c == '\x0d') || (c == '\n')) {
|
||||
if (content_found) break;
|
||||
c = ' ';
|
||||
PUT_TO(bibliographic_sentence, (inchar32_t) c);
|
||||
PUT_TO(bibliographic_sentence, c);
|
||||
} else {
|
||||
PUT_TO(bibliographic_sentence, (inchar32_t) c);
|
||||
PUT_TO(bibliographic_sentence, c);
|
||||
}
|
||||
if (c == '"') quoted = TRUE;
|
||||
}
|
||||
|
|
|
@ -79,7 +79,8 @@ perform them in the "wrong" order, what should the compiler do?
|
|||
=
|
||||
text_stream *Interventions::expand_bracket_plus(text_stream *S) {
|
||||
text_stream *OUT = Str::new();
|
||||
int col = 1, cr, sfp = 0;
|
||||
int col = 1, sfp = 0;
|
||||
inchar32_t cr;
|
||||
TEMPORARY_TEXT(heading_name)
|
||||
TEMPORARY_TEXT(command)
|
||||
TEMPORARY_TEXT(argument)
|
||||
|
@ -87,7 +88,7 @@ text_stream *Interventions::expand_bracket_plus(text_stream *S) {
|
|||
Str::clear(command);
|
||||
Str::clear(argument);
|
||||
@<Read next character@>;
|
||||
NewCharacter: if (cr == EOF) break;
|
||||
NewCharacter: if (cr == CH32EOF) break;
|
||||
if (cr == '{') {
|
||||
@<Read next character@>;
|
||||
if (cr == '-') {
|
||||
|
@ -110,8 +111,8 @@ text_stream *Interventions::expand_bracket_plus(text_stream *S) {
|
|||
goto NewCharacter;
|
||||
}
|
||||
}
|
||||
if (OUT) PUT_TO(OUT, (inchar32_t) cr);
|
||||
} while (cr != EOF);
|
||||
if (OUT) PUT_TO(OUT, cr);
|
||||
} while (cr != CH32EOF);
|
||||
DISCARD_TEXT(command)
|
||||
DISCARD_TEXT(argument)
|
||||
DISCARD_TEXT(heading_name)
|
||||
|
@ -119,7 +120,7 @@ text_stream *Interventions::expand_bracket_plus(text_stream *S) {
|
|||
}
|
||||
|
||||
@<Read next character@> =
|
||||
cr = (int) Str::get_at(S, sfp); if (cr == 0) cr = EOF; else sfp++;
|
||||
cr = Str::get_at(S, sfp); if (cr == 0) cr = CH32EOF; else sfp++;
|
||||
col++; if ((cr == 10) || (cr == 13)) col = 0;
|
||||
|
||||
@ Our biggest complication is that I7 expressions can be included in the I6
|
||||
|
@ -139,10 +140,10 @@ which can trigger an unwanted |(+|.
|
|||
TEMPORARY_TEXT(i7_exp)
|
||||
while (TRUE) {
|
||||
@<Read next character@>;
|
||||
if (cr == EOF) break;
|
||||
if (cr == CH32EOF) break;
|
||||
if ((cr == ')') && (Str::get_last_char(i7_exp) == '+')) {
|
||||
Str::delete_last_character(i7_exp); break; }
|
||||
PUT_TO(i7_exp, (inchar32_t) cr);
|
||||
PUT_TO(i7_exp, cr);
|
||||
}
|
||||
wording W = Feeds::feed_text(i7_exp);
|
||||
CSIInline::eval_bracket_plus_to_text(OUT, W);
|
||||
|
@ -154,10 +155,10 @@ which can trigger an unwanted |(+|.
|
|||
int com_mode = TRUE;
|
||||
while (TRUE) {
|
||||
@<Read next character@>;
|
||||
if ((cr == '}') || (cr == EOF)) break;
|
||||
if ((cr == '}') || (cr == CH32EOF)) break;
|
||||
if ((cr == ':') && (com_mode)) { com_mode = FALSE; continue; }
|
||||
if (com_mode) PUT_TO(command, (inchar32_t) cr);
|
||||
else PUT_TO(argument, (inchar32_t) cr);
|
||||
if (com_mode) PUT_TO(command, cr);
|
||||
else PUT_TO(argument, cr);
|
||||
}
|
||||
|
||||
@<Act on I6T command and argument@> =
|
||||
|
|
|
@ -224,7 +224,7 @@ typedef struct i7_mg_file_t {
|
|||
typedef struct i7_mg_stream_t {
|
||||
FILE *to_file;
|
||||
i7word_t to_file_id;
|
||||
inchar32_t *to_memory;
|
||||
wchar_t *to_memory;
|
||||
size_t memory_used;
|
||||
size_t memory_capacity;
|
||||
i7word_t previous_id;
|
||||
|
@ -620,12 +620,12 @@ void i7_miniglk_stream_set_current(i7process_t *proc, i7word_t id) {
|
|||
characters are written to. The following implements |glk_put_char_stream|.
|
||||
|
||||
= (text to inform7_clib.h)
|
||||
void i7_mg_put_to_stream(i7process_t *proc, i7word_t rock, inchar32_t c);
|
||||
void i7_mg_put_to_stream(i7process_t *proc, i7word_t rock, wchar_t c);
|
||||
void i7_miniglk_put_char_stream(i7process_t *proc, i7word_t stream_id, i7word_t x);
|
||||
=
|
||||
|
||||
= (text to inform7_clib.c)
|
||||
void i7_mg_put_to_stream(i7process_t *proc, i7word_t rock, inchar32_t c) {
|
||||
void i7_mg_put_to_stream(i7process_t *proc, i7word_t rock, wchar_t c) {
|
||||
i7_mg_stream_t *S =
|
||||
&(proc->miniglk->memory_streams[proc->state.current_output_stream_ID]);
|
||||
if (proc->receiver == NULL) fputc(c, stdout);
|
||||
|
@ -665,7 +665,7 @@ void i7_miniglk_put_char_stream(i7process_t *proc, i7word_t stream_id, i7word_t
|
|||
if (S->memory_used >= S->memory_capacity) {
|
||||
size_t needed = 4*S->memory_capacity;
|
||||
if (needed == 0) needed = 1024;
|
||||
inchar32_t *new_data = (inchar32_t *) calloc(needed, sizeof(inchar32_t));
|
||||
wchar_t *new_data = (wchar_t *) calloc(needed, sizeof(wchar_t));
|
||||
if (new_data == NULL) {
|
||||
fprintf(stderr, "Out of memory\n"); i7_fatal_exit(proc);
|
||||
}
|
||||
|
@ -673,7 +673,7 @@ void i7_miniglk_put_char_stream(i7process_t *proc, i7word_t stream_id, i7word_t
|
|||
free(S->to_memory);
|
||||
S->to_memory = new_data;
|
||||
}
|
||||
S->to_memory[S->memory_used++] = (inchar32_t) x;
|
||||
S->to_memory[S->memory_used++] = (wchar_t) x;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -880,7 +880,7 @@ i7word_t i7_miniglk_request_line_event(i7process_t *proc, i7word_t window_id,
|
|||
e.win_id = window_id;
|
||||
e.val1 = 1;
|
||||
e.val2 = 0;
|
||||
inchar32_t c; int pos = init_len;
|
||||
wchar_t c; int pos = init_len;
|
||||
if (proc->sender == NULL) i7_benign_exit(proc);
|
||||
char *s = (proc->sender)(proc->send_count++);
|
||||
int i = 0;
|
||||
|
@ -907,7 +907,7 @@ i7word_t i7_miniglk_request_line_event_uni(i7process_t *proc, i7word_t window_id
|
|||
e.win_id = window_id;
|
||||
e.val1 = 1;
|
||||
e.val2 = 0;
|
||||
inchar32_t c; int pos = init_len;
|
||||
wchar_t c; int pos = init_len;
|
||||
if (proc->sender == NULL) i7_benign_exit(proc);
|
||||
char *s = (proc->sender)(proc->send_count++);
|
||||
int i = 0;
|
||||
|
|
|
@ -418,7 +418,7 @@ typedef struct i7process_t {
|
|||
int snapshot_pos;
|
||||
jmp_buf execution_env;
|
||||
int termination_code;
|
||||
void (*receiver)(int id, inchar32_t c, char *style);
|
||||
void (*receiver)(int id, wchar_t c, char *style);
|
||||
int send_count;
|
||||
char *(*sender)(int count);
|
||||
void (*stylist)(struct i7process_t *proc, i7word_t which, i7word_t what);
|
||||
|
@ -479,7 +479,7 @@ a new process, so we must define those:
|
|||
|
||||
= (text to inform7_clib.h)
|
||||
char *i7_default_sender(int count);
|
||||
void i7_default_receiver(int id, inchar32_t c, char *style);
|
||||
void i7_default_receiver(int id, wchar_t c, char *style);
|
||||
=
|
||||
|
||||
The receiver and sender functions allow our textual I/O to be managed by external
|
||||
|
@ -493,7 +493,7 @@ The sender supplies us with textual commands. By default, it takes a typed (or
|
|||
of course piped) single line of text from the C |stdin| stream.
|
||||
|
||||
= (text to inform7_clib.c)
|
||||
/* Default receiver: writes c to stdout via fputc when id is I7_BODY_TEXT_ID; for any other id nothing is written. The style argument is not used here. NOTE(review): fputc converts its argument to unsigned char, so character values above 255 would presumably be truncated on this path -- confirm whether that is intended. (Diff view: the pre- and post-commit signature lines both appear below.) */
void i7_default_receiver(int id, inchar32_t c, char *style) {
|
||||
void i7_default_receiver(int id, wchar_t c, char *style) {
|
||||
if (id == I7_BODY_TEXT_ID) fputc(c, stdout);
|
||||
}
|
||||
|
||||
|
@ -537,13 +537,13 @@ but may in between the two supply its own receiver or sender:
|
|||
|
||||
= (text to inform7_clib.h)
|
||||
void i7_set_process_receiver(i7process_t *proc,
|
||||
void (*receiver)(int id, inchar32_t c, char *style), int UTF8);
|
||||
void (*receiver)(int id, wchar_t c, char *style), int UTF8);
|
||||
void i7_set_process_sender(i7process_t *proc, char *(*sender)(int count));
|
||||
=
|
||||
|
||||
= (text to inform7_clib.c)
|
||||
/* Stores the supplied receiver callback in proc->receiver and records the UTF8 argument in proc->use_UTF8. (Diff view: the pre- and post-commit versions of the callback parameter line both appear below.) */
void i7_set_process_receiver(i7process_t *proc,
|
||||
void (*receiver)(int id, inchar32_t c, char *style), int UTF8) {
|
||||
void (*receiver)(int id, wchar_t c, char *style), int UTF8) {
|
||||
proc->receiver = receiver;
|
||||
proc->use_UTF8 = UTF8;
|
||||
}
|
||||
|
|
|
@ -296,8 +296,8 @@ void DocReferences::doc_fragment_to(OUTPUT_STREAM, text_stream *fn) {
|
|||
int i = 0;
|
||||
p[0] = 0;
|
||||
while (TRUE) {
|
||||
int c = TextFiles::utf8_fgetc(FRAGMENTS, NULL, NULL);
|
||||
if (c == EOF) break;
|
||||
inchar32_t c = TextFiles::utf8_fgetc(FRAGMENTS, NULL, NULL);
|
||||
if (c == CH32EOF) break;
|
||||
if (c == 0xFEFF) continue; /* the Unicode BOM non-character */
|
||||
if (i == MAX_EXTENT_OF_FRAGMENTS) break;
|
||||
p[i++] = (char) c;
|
||||
|
|
|
@ -87,26 +87,26 @@ int Localisation::stock_from_file(filename *localisation_file, localisation_dict
|
|||
}
|
||||
int col = 1, line = 1, nwsol = FALSE; /* "non white space on line" */
|
||||
unicode_file_buffer ufb = TextFiles::create_ufb();
|
||||
int cr; /* note that on some platforms |inchar32_t| is unable to hold |EOF| */
|
||||
inchar32_t cr;
|
||||
TEMPORARY_TEXT(key)
|
||||
TEMPORARY_TEXT(value)
|
||||
do {
|
||||
@<Read next character@>;
|
||||
if (cr == EOF) break;
|
||||
if (cr == CH32EOF) break;
|
||||
if ((cr == '#') && (nwsol == FALSE)) @<Read up to end of line as a comment@>
|
||||
else if ((cr == '%') && (nwsol == FALSE)) @<Read up to the next white space as a key@>
|
||||
else if (Characters::is_whitespace((inchar32_t) cr) == FALSE) nwsol = TRUE;
|
||||
if (cr == EOF) break;
|
||||
else if (Characters::is_whitespace(cr) == FALSE) nwsol = TRUE;
|
||||
if (cr == CH32EOF) break;
|
||||
if (Str::len(key) > 0) {
|
||||
if ((Characters::is_whitespace((inchar32_t) cr) == FALSE) || (Str::len(value) > 0))
|
||||
PUT_TO(value, (inchar32_t) cr);
|
||||
if ((Characters::is_whitespace(cr) == FALSE) || (Str::len(value) > 0))
|
||||
PUT_TO(value, cr);
|
||||
} else {
|
||||
if (Characters::is_whitespace((inchar32_t) cr) == FALSE) {
|
||||
if (Characters::is_whitespace(cr) == FALSE) {
|
||||
Localisation::error(localisation_file, line, col,
|
||||
I"extraneous matter appears before first %key");
|
||||
}
|
||||
}
|
||||
} while (cr != EOF);
|
||||
} while (cr != CH32EOF);
|
||||
if (Str::len(key) > 0) @<Write key-value pair@>;
|
||||
DISCARD_TEXT(key)
|
||||
DISCARD_TEXT(value)
|
||||
|
@ -125,13 +125,13 @@ int Localisation::stock_from_file(filename *localisation_file, localisation_dict
|
|||
Str::clear(value);
|
||||
while (TRUE) {
|
||||
@<Read next character@>;
|
||||
if ((cr == '=') || (cr == EOF)) break;
|
||||
if (Characters::is_whitespace((inchar32_t) cr) == FALSE) PUT_TO(key, (inchar32_t) cr);
|
||||
if ((cr == '=') || (cr == CH32EOF)) break;
|
||||
if (Characters::is_whitespace(cr) == FALSE) PUT_TO(key, cr);
|
||||
}
|
||||
if (cr == '=') {
|
||||
while (TRUE) {
|
||||
@<Read next character@>;
|
||||
if (Characters::is_whitespace((inchar32_t) cr)) continue;
|
||||
if (Characters::is_whitespace(cr)) continue;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -195,7 +195,7 @@ But we want to restore the more natural spacing.
|
|||
|
||||
@<Restore inter-word spaces unless this would be unnatural@> =
|
||||
if ((i>from)
|
||||
&& ((p[1] != 0) || (Lexer::is_punctuation((int) p[0]) == FALSE) ||
|
||||
&& ((p[1] != 0) || (Lexer::is_punctuation(p[0]) == FALSE) ||
|
||||
(p[0] == '(') || (p[0] == '{') || (p[0] == '}'))
|
||||
&& (compare_word(i-1, OPENBRACKET_V)==FALSE))
|
||||
PasteButtons::put_code_char(OUT, ' ');
|
||||
|
|
|
@ -57,10 +57,10 @@ wording Feeds::feed_C_string_full(inchar32_t *text, int expand, inchar32_t *nons
|
|||
@<Set up the lexer@>;
|
||||
lexer_break_at_slashes = break_at_slashes;
|
||||
for (int i=0; text[i] != 0; i++) {
|
||||
int last_cr, cr, next_cr;
|
||||
if (i > 0) last_cr = (int) text[i-1]; else last_cr = EOF;
|
||||
cr = (int) text[i];
|
||||
if (cr != 0) next_cr = (int) text[i+1]; else next_cr = EOF;
|
||||
inchar32_t last_cr, cr, next_cr;
|
||||
if (i > 0) last_cr = text[i-1]; else last_cr = CH32EOF;
|
||||
cr = text[i];
|
||||
if (cr != 0) next_cr = text[i+1]; else next_cr = CH32EOF;
|
||||
Lexer::feed_triplet(last_cr, cr, next_cr);
|
||||
}
|
||||
@<Extract results from the lexer@>;
|
||||
|
@ -69,10 +69,10 @@ wording Feeds::feed_C_string_full(inchar32_t *text, int expand, inchar32_t *nons
|
|||
wording Feeds::feed_text_full(text_stream *text, int expand, inchar32_t *nonstandard) {
|
||||
@<Set up the lexer@>;
|
||||
for (int i=0, L=Str::len(text); i<L; i++) {
|
||||
int last_cr, cr, next_cr;
|
||||
if (i > 0) last_cr = (int) Str::get_at(text, i-1); else last_cr = EOF;
|
||||
cr = (int) Str::get_at(text, i);
|
||||
if (cr != 0) next_cr = (int) Str::get_at(text, i+1); else next_cr = EOF;
|
||||
inchar32_t last_cr, cr, next_cr;
|
||||
if (i > 0) last_cr = Str::get_at(text, i-1); else last_cr = CH32EOF;
|
||||
cr = Str::get_at(text, i);
|
||||
if (cr != 0) next_cr = Str::get_at(text, i+1); else next_cr = CH32EOF;
|
||||
Lexer::feed_triplet(last_cr, cr, next_cr);
|
||||
}
|
||||
@<Extract results from the lexer@>;
|
||||
|
|
|
@ -91,7 +91,7 @@ characters cause word divisions, or signal literals.
|
|||
@d INFORM6_ESCAPE_END_1 '-'
|
||||
@d INFORM6_ESCAPE_END_2 ')'
|
||||
@d PARAGRAPH_BREAK U"|__" /* Inserted as a special word to mark paragraph breaks */
|
||||
@d UNICODE_CHAR_IN_STRING ((inchar32_t) 0x1b) /* To represent awkward characters in metadata only */
|
||||
@d UNICODE_CHAR_IN_STRING 0x1bu /* To represent awkward characters in metadata only */
|
||||
|
||||
@ This is the standard set used for parsing source text.
|
||||
|
||||
|
@ -374,9 +374,9 @@ As we have seen, the question of whether something is a punctuation mark
|
|||
or not depends slightly on the context:
|
||||
|
||||
=
|
||||
/* Returns TRUE if c equals any entry of lexer_punctuation_marks, which is scanned up to its zero terminator; returns FALSE otherwise. (Diff view: the pre- and post-commit versions of the changed lines both appear below.) */
int Lexer::is_punctuation(int c) {
|
||||
int Lexer::is_punctuation(inchar32_t c) {
|
||||
for (int i=0; lexer_punctuation_marks[i]; i++)
|
||||
if (c == (int) lexer_punctuation_marks[i])
|
||||
if (c == lexer_punctuation_marks[i])
|
||||
return TRUE;
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -490,12 +490,10 @@ The current situation of the lexer is specified by the collective values
|
|||
of all of the following. First, the start of the current word being
|
||||
recorded, and the current high water mark -- those are defined above.
|
||||
Second, we need the feeder machinery to maintain a variable telling us
|
||||
the previous character in the raw, un-respaced source. We need to be a
|
||||
little careful about the type of this: it needs to be an |int| so that it
|
||||
can on occasion hold the pseudo-character value |EOF|.
|
||||
the previous character in the raw, un-respaced source.
|
||||
|
||||
=
|
||||
int lxs_previous_char_in_raw_feed; /* Preceding character in raw file read */
|
||||
inchar32_t lxs_previous_char_in_raw_feed; /* Preceding character in raw file read */
|
||||
|
||||
@ There are four kinds of word: ordinary words, [comments in square brackets],
|
||||
"strings in double quotes," and |(- I6_inclusion_text -)|. The latter
|
||||
|
@ -541,7 +539,7 @@ always being "off").
|
|||
=
|
||||
void Lexer::reset_lexer(void) {
|
||||
lexer_word = lexer_hwm;
|
||||
lxs_previous_char_in_raw_feed = EOF;
|
||||
lxs_previous_char_in_raw_feed = CH32EOF;
|
||||
|
||||
/* reset the external states */
|
||||
lexer_wait_for_dashes = FALSE;
|
||||
|
@ -673,9 +671,9 @@ int Lexer::detect_tear_off(void) {
|
|||
}
|
||||
|
||||
@ The feeder routine is required to send us a triple each time: |cr|
|
||||
must be a valid character (see above) and may not be |EOF|; |last_cr| must
|
||||
be the previous one or else perhaps |EOF| at the start of feed;
|
||||
while |next_cr| must be the next or else perhaps |EOF| at the end of feed.
|
||||
must be a valid character (see above) and may not be |CH32EOF|; |last_cr| must
|
||||
be the previous one or else perhaps |CH32EOF| at the start of feed;
|
||||
while |next_cr| must be the next or else perhaps |CH32EOF| at the end of feed.
|
||||
|
||||
Spaces, often redundant, are inserted around punctuation unless one of the
|
||||
following exceptions holds:
|
||||
|
@ -697,7 +695,7 @@ Where the character following is a slash. (This is done essentially to make
|
|||
most common URLs glue up as single words.)
|
||||
|
||||
=
|
||||
void Lexer::feed_triplet(int last_cr, int cr, int next_cr) {
|
||||
void Lexer::feed_triplet(inchar32_t last_cr, inchar32_t cr, inchar32_t next_cr) {
|
||||
lxs_previous_char_in_raw_feed = last_cr;
|
||||
int space = FALSE;
|
||||
if (Lexer::is_punctuation(cr)) space = TRUE;
|
||||
|
@ -706,9 +704,9 @@ void Lexer::feed_triplet(int last_cr, int cr, int next_cr) {
|
|||
if (next_cr == '/') space = FALSE;
|
||||
else {
|
||||
int lc = 0, nc = 0;
|
||||
if (Characters::isdigit((inchar32_t) last_cr)) lc = 1;
|
||||
if (Characters::isdigit(last_cr)) lc = 1;
|
||||
if ((last_cr >= 'a') && (last_cr <= 'z')) lc = 2;
|
||||
if (Characters::isdigit((inchar32_t) next_cr)) nc = 1;
|
||||
if (Characters::isdigit(next_cr)) nc = 1;
|
||||
if (next_cr == '-') nc = 1;
|
||||
if ((next_cr >= 'a') && (next_cr <= 'z')) nc = 2;
|
||||
if ((lc == 1) && (nc == 1)) space = FALSE;
|
||||
|
@ -754,7 +752,7 @@ surviving marbles is the sequence of characters starting at |lexer_word| and
|
|||
extending to |lexer_hwm-1|.
|
||||
|
||||
=
|
||||
void Lexer::feed_char_into_lexer(int c) {
|
||||
void Lexer::feed_char_into_lexer(inchar32_t c) {
|
||||
Lexer::ensure_lexer_hwm_can_be_raised_by(MAX_WORD_LENGTH, TRUE);
|
||||
|
||||
if (lxs_literal_mode) {
|
||||
|
@ -774,7 +772,7 @@ void Lexer::feed_char_into_lexer(int c) {
|
|||
}
|
||||
|
||||
/* otherwise record the current character as part of the word being built */
|
||||
*(lexer_hwm++) = (inchar32_t) c;
|
||||
*(lexer_hwm++) = c;
|
||||
|
||||
if (lxs_scanning_text_substitution) {
|
||||
@<Force string division at the end of a text substitution, if necessary@>;
|
||||
|
@ -834,7 +832,7 @@ discarded. A paragraph break is converted into a special "divider" word.
|
|||
@<Line break outside a literal@> =
|
||||
if (lxs_this_line_is_empty_so_far) {
|
||||
for (int i=0; PARAGRAPH_BREAK[i]; i++)
|
||||
Lexer::feed_char_into_lexer((int) PARAGRAPH_BREAK[i]);
|
||||
Lexer::feed_char_into_lexer(PARAGRAPH_BREAK[i]);
|
||||
Lexer::feed_char_into_lexer(' ');
|
||||
}
|
||||
lxs_this_line_is_empty_so_far = TRUE;
|
||||
|
@ -852,7 +850,7 @@ Inform print a paragraph break at run-time.
|
|||
@<Soak up whitespace around line breaks inside a literal string@> =
|
||||
if (lxs_string_soak_up_spaces_mode) {
|
||||
switch(c) {
|
||||
case ' ': case '\t': c = (int) *(lexer_hwm-1); lexer_hwm--; break;
|
||||
case ' ': case '\t': c = *(lexer_hwm-1); lexer_hwm--; break;
|
||||
case '\n':
|
||||
*(lexer_hwm-1) = NEWLINE_IN_STRING;
|
||||
c = NEWLINE_IN_STRING;
|
||||
|
@ -1028,7 +1026,7 @@ finished.
|
|||
case STRING_KW:
|
||||
if (c == STRING_END) {
|
||||
lxs_string_soak_up_spaces_mode = FALSE;
|
||||
*(lexer_hwm++) = (inchar32_t) c; /* record the |STRING_END| character as part of the word */
|
||||
*(lexer_hwm++) = c; /* record the |STRING_END| character as part of the word */
|
||||
lxs_literal_mode = FALSE;
|
||||
}
|
||||
break;
|
||||
|
|
|
@ -45,7 +45,8 @@ source_file *TextFromFiles::feed_open_file_into_lexer(filename *F, FILE *handle,
|
|||
sf->body_text = Str::new();
|
||||
sf->torn_off_documentation = Str::new();
|
||||
source_location top_of_file;
|
||||
int cr, last_cr, next_cr, read_cr, newline_char = 0, torn_off = FALSE;
|
||||
inchar32_t cr, last_cr, next_cr, read_cr, newline_char = 0;
|
||||
int torn_off = FALSE;
|
||||
|
||||
unicode_file_buffer ufb = TextFiles::create_filtered_ufb(mode);
|
||||
|
||||
|
@ -57,8 +58,8 @@ source_file *TextFromFiles::feed_open_file_into_lexer(filename *F, FILE *handle,
|
|||
|
||||
last_cr = ' '; cr = ' '; next_cr = TextFiles::utf8_fgetc(sf->handle, NULL, &ufb);
|
||||
if (next_cr == 0xFEFF) next_cr = TextFiles::utf8_fgetc(sf->handle, NULL, &ufb); /* Unicode BOM code */
|
||||
if (next_cr != EOF)
|
||||
while (((read_cr = TextFiles::utf8_fgetc(sf->handle, NULL, &ufb)), next_cr) != EOF) {
|
||||
if (next_cr != CH32EOF)
|
||||
while (((read_cr = TextFiles::utf8_fgetc(sf->handle, NULL, &ufb)), next_cr) != CH32EOF) {
|
||||
last_cr = cr; cr = next_cr; next_cr = read_cr;
|
||||
switch(cr) {
|
||||
case '\x0a':
|
||||
|
@ -78,9 +79,9 @@ source_file *TextFromFiles::feed_open_file_into_lexer(filename *F, FILE *handle,
|
|||
break;
|
||||
}
|
||||
if (torn_off) {
|
||||
PUT_TO(sf->torn_off_documentation, (inchar32_t) cr);
|
||||
PUT_TO(sf->torn_off_documentation, cr);
|
||||
} else {
|
||||
PUT_TO(sf->body_text, (inchar32_t) cr);
|
||||
PUT_TO(sf->body_text, cr);
|
||||
Lexer::feed_triplet(last_cr, cr, next_cr);
|
||||
torn_off = Lexer::detect_tear_off();
|
||||
}
|
||||
|
@ -157,7 +158,7 @@ int TextFromFiles::word_count(int wc) {
|
|||
/* outside quoted text, each lexer word not wholly composed of punctuation scores 1 */
|
||||
if (Lexer::word(wc) != PARBREAK_V)
|
||||
for (; *p != 0; p++)
|
||||
if ((Lexer::is_punctuation((int) *p) == FALSE) && (*p != '|')) {
|
||||
if ((Lexer::is_punctuation(*p) == FALSE) && (*p != '|')) {
|
||||
N++;
|
||||
break;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue