/************************************************* * Perl-Compatible Regular Expressions * *************************************************/ /* PCRE is a library of functions to support regular expressions whose syntax and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge New API code Copyright (c) 2016-2024 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the University of Cambridge nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
----------------------------------------------------------------------------- */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "pcre2_compile.h" typedef struct { /* Option bits for eclass. */ uint32_t options; uint32_t xoptions; /* Rarely used members. */ int *errorcodeptr; compile_block *cb; /* Bitmap is needed. */ BOOL needs_bitmap; } eclass_context; /* Checks the allowed tokens at the end of a class structure in debug mode. When a new token is not processed by all loops, and the token is equals to a) one of the cases here: the compiler will complain about a duplicated case value. b) none of the cases here: the loop without the handler will stop with an assertion failure. */ #ifdef PCRE2_DEBUG #define CLASS_END_CASES(meta) \ default: \ PCRE2_ASSERT((meta) <= META_END); \ /* Fall through */ \ case META_CLASS: \ case META_CLASS_NOT: \ case META_CLASS_EMPTY: \ case META_CLASS_EMPTY_NOT: \ case META_CLASS_END: \ case META_ECLASS_AND: \ case META_ECLASS_OR: \ case META_ECLASS_SUB: \ case META_ECLASS_XOR: \ case META_ECLASS_NOT: #else #define CLASS_END_CASES(meta) \ default: #endif #ifdef SUPPORT_WIDE_CHARS /* Heapsort algorithm. */ static void do_heapify(uint32_t *buffer, size_t size, size_t i) { size_t max; size_t left; size_t right; uint32_t tmp1, tmp2; while (TRUE) { max = i; left = (i << 1) + 2; right = left + 2; if (left < size && buffer[left] > buffer[max]) max = left; if (right < size && buffer[right] > buffer[max]) max = right; if (i == max) return; /* Swap items. */ tmp1 = buffer[i]; tmp2 = buffer[i + 1]; buffer[i] = buffer[max]; buffer[i + 1] = buffer[max + 1]; buffer[max] = tmp1; buffer[max + 1] = tmp2; i = max; } } #ifdef SUPPORT_UNICODE #define PARSE_CLASS_UTF 0x1 #define PARSE_CLASS_CASELESS_UTF 0x2 #define PARSE_CLASS_RESTRICTED_UTF 0x4 #define PARSE_CLASS_TURKISH_UTF 0x8 /* Get the range of nocase characters which includes the 'c' character passed as argument, or directly follows 'c'. 
*/

/* Implementation notes for get_nocase_range(): this is a binary search over
PRIV(ucd_nocase_ranges), which is accessed as a flat array of (start, end)
pairs: even indexes hold range starts and odd indexes hold range ends. A
character equal to a range end is treated as lying beyond that range. The
returned pointer addresses the start element of the first pair whose end is
greater than 'c'. For c > MAX_UTF_CODE_POINT the element at index
PRIV(ucd_nocase_ranges_size) is returned without searching. */

static const uint32_t*
get_nocase_range(uint32_t c)
{
uint32_t left = 0;
uint32_t right = PRIV(ucd_nocase_ranges_size);
uint32_t middle;

if (c > MAX_UTF_CODE_POINT)
  return PRIV(ucd_nocase_ranges) + right;

while (TRUE)
  {
  /* Range end of the middle element. The "| 0x1" forces an odd index. */
  middle = ((left + right) >> 1) | 0x1;

  if (PRIV(ucd_nocase_ranges)[middle] <= c)
    left = middle + 1;      /* 'c' is at or beyond this range end. */
  else if (middle > 1 && PRIV(ucd_nocase_ranges)[middle - 2] > c)
    right = middle - 1;     /* 'c' is also below the previous range end. */
  else
    return PRIV(ucd_nocase_ranges) + (middle - 1);
  }
}

/* Get the list of othercase characters, which belongs to the passed range.
Create ranges from these characters, and append them to the buffer argument.

Other-case characters that fall inside the range are ignored, those directly
adjacent to it widen the range, and all others are appended as
single-character ranges. The (possibly widened) original range is always
written last.

Arguments:
  start      first character of the range
  end        last character of the range
  options    PARSE_CLASS_* bits
  buffer     where to store the ranges, or NULL to only compute the size

Returns:     the number of uint32_t values written to buffer (or that would
             be written when buffer is NULL); always at least 2
*/

static size_t
utf_caseless_extend(uint32_t start, uint32_t end, uint32_t options,
  uint32_t *buffer)
{
uint32_t new_start = start;
uint32_t new_end = end;
uint32_t c = start;
const uint32_t *list;
uint32_t tmp[3];   /* c, optionally its other case, and the terminator. */
size_t result = 2; /* Accounts for the final [new_start, new_end] pair. */
const uint32_t *skip_range = get_nocase_range(c);
uint32_t skip_start = skip_range[0];

#if PCRE2_CODE_UNIT_WIDTH == 8
PCRE2_ASSERT(options & PARSE_CLASS_UTF);
#endif

#if PCRE2_CODE_UNIT_WIDTH == 32
/* Only the scan is limited here; new_end keeps the original value. */
if (end > MAX_UTF_CODE_POINT)
  end = MAX_UTF_CODE_POINT;
#endif

while (c <= end)
  {
  uint32_t co;

  /* Once 'c' has passed the start of the current nocase range, jump to the
  end of that range and load the following range. */
  if (c > skip_start)
    {
    c = skip_range[1];
    skip_range += 2;
    skip_start = skip_range[0];
    continue;
    }

  /* Compute caseless set. The Turkish set is used only when Turkish casing
  is requested without the caseless restriction. */

  if ((options & (PARSE_CLASS_TURKISH_UTF|PARSE_CLASS_RESTRICTED_UTF)) ==
        PARSE_CLASS_TURKISH_UTF &&
      UCD_ANY_I(c))
    {
    co = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
    }
  else if ((co = UCD_CASESET(c)) != 0 &&
           (options & PARSE_CLASS_RESTRICTED_UTF) != 0 &&
           PRIV(ucd_caseless_sets)[co] < 128)
    {
    co = 0;  /* Ignore the caseless set if it's restricted. */
    }

  if (co != 0)
    list = PRIV(ucd_caseless_sets) + co;
  else
    {
    /* No caseless set: build a temporary NOTACHAR-terminated list. */
    co = UCD_OTHERCASE(c);
    list = tmp;
    tmp[0] = c;
    tmp[1] = NOTACHAR;

    if (co != c)
      {
      tmp[1] = co;
      tmp[2] = NOTACHAR;
      }
    }
  c++;

  /* Add characters. Every "continue" below jumps to the loop condition,
  which advances to the next list entry. */
  do
    {
#if PCRE2_CODE_UNIT_WIDTH == 16
    /* Without UTF, characters above 0xffff are not added. */
    if (!(options & PARSE_CLASS_UTF) && *list > 0xffff) continue;
#endif

    if (*list < new_start)
      {
      if (*list + 1 == new_start)
        {
        new_start--;   /* Directly below the range: widen it. */
        continue;
        }
      }
    else if (*list > new_end)
      {
      if (*list - 1 == new_end)
        {
        new_end++;     /* Directly above the range: widen it. */
        continue;
        }
      }
    else continue;     /* Already inside the range. */

    /* A detached character is stored as a single-character range. */
    result += 2;
    if (buffer != NULL)
      {
      buffer[0] = *list;
      buffer[1] = *list;
      buffer += 2;
      }
    }
  while (*(++list) != NOTACHAR);
  }

if (buffer != NULL)
  {
  buffer[0] = new_start;
  buffer[1] = new_end;
  buffer += 2;
  (void)buffer;   /* The advanced pointer is not used afterwards. */
  }
return result;
}

#endif

/* Add a character list to a buffer. The list is terminated by NOTACHAR, and
runs of consecutive values are stored as a single (first, last) pair.

Arguments:
  p          the NOTACHAR-terminated character list
  buffer     where to store the pairs, or NULL to only compute the size

Returns:     the number of uint32_t values written (or that would be written)
*/

static size_t
append_char_list(const uint32_t *p, uint32_t *buffer)
{
const uint32_t *n;
size_t result = 0;

while (*p != NOTACHAR)
  {
  /* Move n to the last value of the run of consecutive values. */
  n = p;
  while (n[0] == n[1] - 1) n++;

  PCRE2_ASSERT(*p < 0xffff);

  if (buffer != NULL)
    {
    buffer[0] = *p;
    buffer[1] = *n;
    buffer += 2;
    }

  result += 2;
  p = n + 1;
  }

return result;
}

/* Returns the highest character value used as the end of open-ended ranges.
In the 8-bit library this is always MAX_UTF_CODE_POINT; otherwise it is
GET_MAX_CHAR_VALUE() of the PARSE_CLASS_UTF flag when Unicode is supported,
and MAX_UCHAR_VALUE when it is not. */

static uint32_t
get_highest_char(uint32_t options)
{
(void)options; /* Avoid compiler warning. */

#if PCRE2_CODE_UNIT_WIDTH == 8
return MAX_UTF_CODE_POINT;
#else
#ifdef SUPPORT_UNICODE
return GET_MAX_CHAR_VALUE((options & PARSE_CLASS_UTF) != 0);
#else
return MAX_UCHAR_VALUE;
#endif
#endif
}

/* Add a negated character list to a buffer.
*/ static size_t append_negated_char_list(const uint32_t *p, uint32_t options, uint32_t *buffer) { const uint32_t *n; uint32_t start = 0; size_t result = 2; PCRE2_ASSERT(*p > 0); while (*p != NOTACHAR) { n = p; while (n[0] == n[1] - 1) n++; PCRE2_ASSERT(*p < 0xffff); if (buffer != NULL) { buffer[0] = start; buffer[1] = *p - 1; buffer += 2; } result += 2; start = *n + 1; p = n + 1; } if (buffer != NULL) { buffer[0] = start; buffer[1] = get_highest_char(options); buffer += 2; (void)buffer; } return result; } static uint32_t * append_non_ascii_range(uint32_t options, uint32_t *buffer) { if (buffer == NULL) return NULL; buffer[0] = 0x100; buffer[1] = get_highest_char(options); return buffer + 2; } static size_t parse_class(uint32_t *ptr, uint32_t options, uint32_t *buffer) { size_t total_size = 0; size_t size; uint32_t meta_arg; uint32_t start_char; while (TRUE) { switch (META_CODE(*ptr)) { case META_ESCAPE: meta_arg = META_DATA(*ptr); switch (meta_arg) { case ESC_D: case ESC_W: case ESC_S: buffer = append_non_ascii_range(options, buffer); total_size += 2; break; case ESC_h: size = append_char_list(PRIV(hspace_list), buffer); total_size += size; if (buffer != NULL) buffer += size; break; case ESC_H: size = append_negated_char_list(PRIV(hspace_list), options, buffer); total_size += size; if (buffer != NULL) buffer += size; break; case ESC_v: size = append_char_list(PRIV(vspace_list), buffer); total_size += size; if (buffer != NULL) buffer += size; break; case ESC_V: size = append_negated_char_list(PRIV(vspace_list), options, buffer); total_size += size; if (buffer != NULL) buffer += size; break; case ESC_p: case ESC_P: ptr++; if (meta_arg == ESC_p && (*ptr >> 16) == PT_ANY) { if (buffer != NULL) { buffer[0] = 0; buffer[1] = get_highest_char(options); buffer += 2; } total_size += 2; } break; } ptr++; continue; case META_POSIX_NEG: buffer = append_non_ascii_range(options, buffer); total_size += 2; ptr += 2; continue; case META_POSIX: ptr += 2; continue; case 
META_BIGVALUE: /* Character literal */ ptr++; break; CLASS_END_CASES(*ptr) if (*ptr >= META_END) return total_size; break; } start_char = *ptr; if (ptr[1] == META_RANGE_LITERAL || ptr[1] == META_RANGE_ESCAPED) { ptr += 2; PCRE2_ASSERT(*ptr < META_END || *ptr == META_BIGVALUE); if (*ptr == META_BIGVALUE) ptr++; #ifdef EBCDIC #error "Missing EBCDIC support" #endif } #ifdef SUPPORT_UNICODE if (options & PARSE_CLASS_CASELESS_UTF) { size = utf_caseless_extend(start_char, *ptr++, options, buffer); if (buffer != NULL) buffer += size; total_size += size; continue; } #endif if (buffer != NULL) { buffer[0] = start_char; buffer[1] = *ptr; buffer += 2; } ptr++; total_size += 2; } return total_size; } /* Extra uint32_t values for storing the lengths of range lists in the worst case. Two uint32_t lengths and a range end for a range starting before 255 */ #define CHAR_LIST_EXTRA_SIZE 3 /* Starting character values for each character list. */ static const uint32_t char_list_starts[] = { #if PCRE2_CODE_UNIT_WIDTH == 32 XCL_CHAR_LIST_HIGH_32_START, #endif #if PCRE2_CODE_UNIT_WIDTH == 32 || defined SUPPORT_UNICODE XCL_CHAR_LIST_LOW_32_START, #endif XCL_CHAR_LIST_HIGH_16_START, /* Must be terminated by XCL_CHAR_LIST_LOW_16_START, which also represents the end of the bitset. 
*/
  XCL_CHAR_LIST_LOW_16_START,
};

/* Pre-processes the characters of a class into a sorted, merged list of
(start, end) ranges, and, when there are enough ranges that are not entirely
below 0x100, converts the upper ranges into character lists.

The result is a single allocation: a class_ranges header directly followed by
the uint32_t buffer. The range list grows from the start of the buffer; the
character list items are written backwards from the end of the buffer.

Arguments:
  start_ptr  the first parsed item of the class
  options    the options bits
  xoptions   the extra options bits
  cb         compile data (provides the memory allocator)

Returns:     the new class_ranges block, or NULL if the allocation failed
*/

static class_ranges *
compile_optimize_class(uint32_t *start_ptr, uint32_t options,
  uint32_t xoptions, compile_block *cb)
{
class_ranges* cranges;
uint32_t *ptr;
uint32_t *buffer;
uint32_t *dst;
uint32_t class_options = 0;
size_t range_list_size = 0, total_size, i;
uint32_t tmp1, tmp2;
const uint32_t *char_list_next;
uint16_t *next_char;
uint32_t char_list_start, char_list_end;
uint32_t range_start, range_end;

/* Translate the compile options into the PARSE_CLASS_* bits. Caseless
extension of ranges is requested only when UTF or UCP is also set. */
#ifdef SUPPORT_UNICODE
if (options & PCRE2_UTF)
  class_options |= PARSE_CLASS_UTF;

if ((options & PCRE2_CASELESS) && (options & (PCRE2_UTF|PCRE2_UCP)))
  class_options |= PARSE_CLASS_CASELESS_UTF;

if (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT)
  class_options |= PARSE_CLASS_RESTRICTED_UTF;

if (xoptions & PCRE2_EXTRA_TURKISH_CASING)
  class_options |= PARSE_CLASS_TURKISH_UTF;
#endif

/* Compute required space for the range. The first parse_class() pass gets a
NULL buffer and therefore only counts. */

range_list_size = parse_class(start_ptr, class_options, NULL);
PCRE2_ASSERT((range_list_size & 0x1) == 0);

/* Allocate buffer. The total_size also represents the end of the buffer. */

total_size = range_list_size +
   ((range_list_size >= 2) ? CHAR_LIST_EXTRA_SIZE : 0);

cranges = cb->cx->memctl.malloc(
  sizeof(class_ranges) + total_size * sizeof(uint32_t),
  cb->cx->memctl.memory_data);

if (cranges == NULL) return NULL;

cranges->next = NULL;
cranges->range_list_size = (uint16_t)range_list_size;
cranges->char_lists_types = 0;
cranges->char_lists_size = 0;
cranges->char_lists_start = 0;

if (range_list_size == 0) return cranges;

/* The ranges are stored directly after the header. The second parse_class()
pass fills them in. */
buffer = (uint32_t*)(cranges + 1);
parse_class(start_ptr, class_options, buffer);

/* Using <= instead of == to help static analysis. A single range needs
neither sorting nor merging. */
if (range_list_size <= 2) return cranges;

/* In-place sorting of ranges. This is a heapsort of two-word items ordered
by their first word (the range start): build the heap starting from the last
item that has a child, then repeatedly move the largest item to the end. */

i = (((range_list_size >> 2) - 1) << 1);
while (TRUE)
  {
  do_heapify(buffer, range_list_size, i);
  if (i == 0) break;
  i -= 2;
  }

i = range_list_size - 2;
while (TRUE)
  {
  tmp1 = buffer[i];
  tmp2 = buffer[i + 1];
  buffer[i] = buffer[0];
  buffer[i + 1] = buffer[1];
  buffer[0] = tmp1;
  buffer[1] = tmp2;

  do_heapify(buffer, i, 0);
  if (i == 0) break;
  i -= 2;
  }

/* Merge ranges whenever possible. dst addresses the last merged range and
ptr the next unprocessed one; range_list_size counts the values still to be
processed. */

dst = buffer;
ptr = buffer + 2;
range_list_size -= 2;

/* The second condition is a very rare corner case, where the
end of the last range is the maximum character. This range
cannot be extended further. (It also keeps dst[1] + 1 from wrapping.) */

while (range_list_size > 0 && dst[1] != ~(uint32_t)0)
  {
  if (dst[1] + 1 < ptr[0])
    {
    /* Neither overlapping nor adjacent: start a new merged range. */
    dst += 2;
    dst[0] = ptr[0];
    dst[1] = ptr[1];
    }
  else if (dst[1] < ptr[1]) dst[1] = ptr[1];

  ptr += 2;
  range_list_size -= 2;
  }

PCRE2_ASSERT(dst[1] <= get_highest_char(class_options));

/* When the number of ranges are less than six, they are not converted to
range lists. Ranges that end below 0x100 are not counted. */

ptr = buffer;
while (ptr < dst && ptr[1] < 0x100) ptr += 2;
if (dst - ptr < (2 * (6 - 1)))
  {
  cranges->range_list_size = (uint16_t)(dst + 2 - buffer);
  return cranges;
  }

/* Compute character lists structures. The lists are taken from
char_list_starts[] in order, and the ranges are consumed from the last
(highest) one downwards. Items are written backwards from the end of the
buffer: one uint16_t per item while char_list_start is below
XCL_CHAR_LIST_LOW_32_START, otherwise one uint32_t per item.
NOTE(review): the meaning of the encoded item bits is defined by the XCL_*
macros, which are not visible in this file. */

char_list_next = char_list_starts;
char_list_start = *char_list_next++;
#if PCRE2_CODE_UNIT_WIDTH == 32
char_list_end = XCL_CHAR_LIST_HIGH_32_END;
#elif defined SUPPORT_UNICODE
char_list_end = XCL_CHAR_LIST_LOW_32_END;
#else
char_list_end = XCL_CHAR_LIST_HIGH_16_END;
#endif
next_char = (uint16_t*)(buffer + total_size);

/* tmp1 counts the items of the current list; tmp2 is the bit position of the
current list's field inside char_lists_types. */
tmp1 = 0;
tmp2 = ((sizeof(char_list_starts) / sizeof(uint32_t)) - 1) * XCL_TYPE_BIT_LEN;
PCRE2_ASSERT(tmp2 <= 3 * XCL_TYPE_BIT_LEN && tmp2 >= XCL_TYPE_BIT_LEN);
range_start = dst[0];
range_end = dst[1];

while (TRUE)
  {
  if (range_start >= char_list_start)
    {
    /* The range starts inside the current list. Store its end, unless it is
    a true range that reaches the end of the list. */
    if (range_start == range_end || range_end < char_list_end)
      {
      tmp1++;
      next_char--;

      if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
        *next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
      else
        *(uint32_t*)(--next_char) =
          (range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;
      }

    /* For a true range, store its start, or flag that the list begins with
    a range when the start coincides with the start of the list. */
    if (range_start < range_end)
      {
      if (range_start > char_list_start)
        {
        tmp1++;
        next_char--;

        if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
          *next_char = (uint16_t)(range_start << XCL_CHAR_SHIFT);
        else
          *(uint32_t*)(--next_char) = (range_start << XCL_CHAR_SHIFT);
        }
      else
        cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
      }

    PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);

    /* Continue with the previous (lower) range, if there is one. */
    if (dst > buffer)
      {
      dst -= 2;
      range_start = dst[0];
      range_end = dst[1];
      continue;
      }

    /* All ranges have been consumed. */
    range_start = 0;
    range_end = 0;
    }

  /* The range starts below the current list but reaches into it. */
  if (range_end >= char_list_start)
    {
    PCRE2_ASSERT(range_start < char_list_start);

    if (range_end < char_list_end)
      {
      tmp1++;
      next_char--;

      if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
        *next_char = (uint16_t)((range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END);
      else
        *(uint32_t*)(--next_char) =
          (range_end << XCL_CHAR_SHIFT) | XCL_CHAR_END;

      PCRE2_ASSERT((uint32_t*)next_char >= dst + 2);
      }

    cranges->char_lists_types |= XCL_BEGIN_WITH_RANGE << tmp2;
    }

  /* Record the item count of the list. A count that does not fit into the
  type field is stored as an additional item, and the field is set to
  XCL_ITEM_COUNT_MASK. */
  if (tmp1 >= XCL_ITEM_COUNT_MASK)
    {
    cranges->char_lists_types |= XCL_ITEM_COUNT_MASK << tmp2;
    next_char--;

    if (char_list_start < XCL_CHAR_LIST_LOW_32_START)
      *next_char = (uint16_t)tmp1;
    else
      *(uint32_t*)(--next_char) = tmp1;
    }
  else
    cranges->char_lists_types |= tmp1 << tmp2;

  if (range_start < XCL_CHAR_LIST_LOW_16_START) break;

  /* Move on to the next (lower) character list. */
  PCRE2_ASSERT(tmp2 >= XCL_TYPE_BIT_LEN);
  char_list_end = char_list_start - 1;
  char_list_start = *char_list_next++;
  tmp1 = 0;
  tmp2 -= XCL_TYPE_BIT_LEN;
  }

/* A range that starts below XCL_CHAR_LIST_LOW_16_START stays in the range
list. */
if (dst[0] < XCL_CHAR_LIST_LOW_16_START) dst += 2;
PCRE2_ASSERT((uint16_t*)dst <= next_char);

/* Both values below are byte counts: the size of the character lists, and
their offset from the start of the buffer. */
cranges->char_lists_size =
  (size_t)((uint8_t*)(buffer + total_size) - (uint8_t*)next_char);
cranges->char_lists_start = (size_t)((uint8_t*)next_char - (uint8_t*)buffer);
cranges->range_list_size = (uint16_t)(dst - buffer);
return cranges;
}

#endif /* SUPPORT_WIDE_CHARS */

#ifdef SUPPORT_UNICODE

/* Adds the characters below 256 that have (or, when negated, do not have) a
given Unicode property to a 32-byte bit map. Bits are only ever set, never
cleared. For PT_ANY every bit is set when not negated, and the map is left
untouched when negated.

Arguments:
  ptype      the property type (PT_xxx)
  pdata      the property data; used by some of the property types only
  negated    TRUE to add the characters that do not have the property
  classbits  the 32-byte bit map to update
*/

void PRIV(update_classbits)(uint32_t ptype, uint32_t pdata, BOOL negated,
  uint8_t *classbits)
{
/* Update PRIV(xclass) when this function is changed. */
int c, chartype;
const ucd_record *prop;
uint32_t gentype;
BOOL set_bit;

if (ptype == PT_ANY)
  {
  if (!negated) memset(classbits, 0xff, 32);
  return;
  }

for (c = 0; c < 256; c++)
  {
  prop = GET_UCD(c);
  set_bit = FALSE;
  (void)set_bit;

  switch (ptype)
    {
    case PT_LAMP:
    chartype = prop->chartype;
    set_bit = (chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt);
    break;

    case PT_GC:
    set_bit = (PRIV(ucp_gentype)[prop->chartype] == pdata);
    break;

    case PT_PC:
    set_bit = (prop->chartype == pdata);
    break;

    case PT_SC:
    set_bit = (prop->script == pdata);
    break;

    case PT_SCX:
    set_bit = (prop->script == pdata ||
      MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), pdata) != 0);
    break;

    case PT_ALNUM:
    gentype = PRIV(ucp_gentype)[prop->chartype];
    set_bit = (gentype == ucp_L || gentype == ucp_N);
    break;

    case PT_SPACE:    /* Perl space */
    case PT_PXSPACE:  /* POSIX space */
    switch(c)
      {
      HSPACE_BYTE_CASES:
      VSPACE_BYTE_CASES:
      set_bit = TRUE;
      break;

      default:
      set_bit = (PRIV(ucp_gentype)[prop->chartype] == ucp_Z);
      break;
      }
    break;

    case PT_WORD:
    chartype = prop->chartype;
    gentype = PRIV(ucp_gentype)[chartype];
    set_bit = (gentype == ucp_L || gentype == ucp_N ||
               chartype == ucp_Mn || chartype == ucp_Pc);
    break;

    case PT_UCNC:
    set_bit = (c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
               c == CHAR_GRAVE_ACCENT || c >= 0xa0);
    break;

    case PT_BIDICL:
    set_bit = (UCD_BIDICLASS_PROP(prop) == pdata);
    break;

    case PT_BOOL:
    set_bit = MAPBIT(PRIV(ucd_boolprop_sets) +
                     UCD_BPROPS_PROP(prop), pdata) != 0;
    break;

    case PT_PXGRAPH:
    chartype = prop->chartype;
    gentype = PRIV(ucp_gentype)[chartype];
    set_bit = (gentype != ucp_Z && (gentype != ucp_C || chartype == ucp_Cf));
    break;

    case PT_PXPRINT:
    chartype = prop->chartype;
    set_bit = (chartype != ucp_Zl && chartype != ucp_Zp &&
       (PRIV(ucp_gentype)[chartype] != ucp_C || chartype == ucp_Cf));
    break;

    case PT_PXPUNCT:
    gentype = PRIV(ucp_gentype)[prop->chartype];
    set_bit = (gentype == ucp_P || (c < 128 && gentype == ucp_S));
    break;

    default:
    PCRE2_ASSERT(ptype == PT_PXXDIGIT);
    set_bit = (c >= CHAR_0 && c <= CHAR_9) ||
              (c >= CHAR_A && c <= CHAR_F) ||
              (c >= CHAR_a && c <= CHAR_f);
    break;
    }

  /* Set the bit of 'c' when required, and step to the next byte of the bit
  map after every eighth character. */
  if (negated) set_bit = !set_bit;
  if (set_bit) *classbits |= (uint8_t)(1 << (c & 0x7));
  if ((c & 0x7) == 0x7) classbits++;
  }
}

#endif /* SUPPORT_UNICODE */

#ifdef SUPPORT_WIDE_CHARS

/*************************************************
*           XClass related properties            *
*************************************************/

/* XClass needs to be generated. */
#define XCLASS_REQUIRED 0x1
/* XClass has 8 bit character. */
#define XCLASS_HAS_8BIT_CHARS 0x2
/* XClass has properties. */
#define XCLASS_HAS_PROPS 0x4
/* XClass has character lists. */
#define XCLASS_HAS_CHAR_LISTS 0x8
/* XClass matches to all >= 256 characters. */
#define XCLASS_HIGH_ANY 0x10

#endif

/*************************************************
*   Internal entry point for add range to class  *
*************************************************/

/* This function sets the overall range for characters < 256.
It also handles non-utf case folding.
Arguments: options the options bits xoptions the extra options bits cb compile data start start of range character end end of range character Returns: cb->classbits is updated */ static void add_to_class(uint32_t options, uint32_t xoptions, compile_block *cb, uint32_t start, uint32_t end) { uint8_t *classbits = cb->classbits.classbits; uint32_t c, byte_start, byte_end; uint32_t classbits_end = (end <= 0xff ? end : 0xff); /* If caseless matching is required, scan the range and process alternate cases. In Unicode, there are 8-bit characters that have alternate cases that are greater than 255 and vice-versa (though these may be ignored if caseless restriction is in force). Sometimes we can just extend the original range. */ if ((options & PCRE2_CASELESS) != 0) { #ifdef SUPPORT_UNICODE /* UTF mode. This branch is taken if we don't support wide characters (e.g. 8-bit library, without UTF), but we do treat those characters as Unicode (if UCP flag is set). In this case, we only need to expand the character class set to include the case pairs which are in the 0-255 codepoint range. */ if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0) { BOOL turkish_i = (xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) == PCRE2_EXTRA_TURKISH_CASING; if (start < 128) { uint32_t lo_end = (classbits_end < 127 ? classbits_end : 127); for (c = start; c <= lo_end; c++) { if (turkish_i && UCD_ANY_I(c)) continue; SETBIT(classbits, cb->fcc[c]); } } if (classbits_end >= 128) { uint32_t hi_start = (start > 128 ? start : 128); for (c = hi_start; c <= classbits_end; c++) { uint32_t co = UCD_OTHERCASE(c); if (co <= 0xff) SETBIT(classbits, co); } } } else #endif /* SUPPORT_UNICODE */ /* Not UTF mode */ { for (c = start; c <= classbits_end; c++) SETBIT(classbits, cb->fcc[c]); } } /* Use the bitmap for characters < 256. Otherwise use extra data. 
*/ byte_start = (start + 7) >> 3; byte_end = (classbits_end + 1) >> 3; if (byte_start >= byte_end) { for (c = start; c <= classbits_end; c++) /* Regardless of start, c will always be <= 255. */ SETBIT(classbits, c); return; } for (c = byte_start; c < byte_end; c++) classbits[c] = 0xff; byte_start <<= 3; byte_end <<= 3; for (c = start; c < byte_start; c++) SETBIT(classbits, c); for (c = byte_end; c <= classbits_end; c++) SETBIT(classbits, c); } #if PCRE2_CODE_UNIT_WIDTH == 8 /************************************************* * Internal entry point for add list to class * *************************************************/ /* This function is used for adding a list of horizontal or vertical whitespace characters to a class. The list must be in order so that ranges of characters can be detected and handled appropriately. This function sets the overall range so that the internal functions can try to avoid duplication when handling case-independence. Arguments: options the options bits xoptions the extra options bits cb contains pointers to tables etc. p points to row of 32-bit values, terminated by NOTACHAR Returns: cb->classbits is updated */ static void add_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb, const uint32_t *p) { while (p[0] < 256) { unsigned int n = 0; while(p[n+1] == p[0] + n + 1) n++; add_to_class(options, xoptions, cb, p[0], p[n]); p += n + 1; } } /************************************************* * Add characters not in a list to a class * *************************************************/ /* This function is used for adding the complement of a list of horizontal or vertical whitespace to a class. The list must be in order. Arguments: options the options bits xoptions the extra options bits cb contains pointers to tables etc. 
  p             points to row of 32-bit values, terminated by NOTACHAR

Returns:        cb->classbits is updated

Only the characters below 256 are added: the upper end of every added range
is limited to 255.
*/

static void
add_not_list_to_class(uint32_t options, uint32_t xoptions, compile_block *cb,
  const uint32_t *p)
{
/* The characters before the first list entry. */
if (p[0] > 0)
  add_to_class(options, xoptions, cb, 0, p[0] - 1);

while (p[0] < 256)
  {
  /* Move to the last value of a run of consecutive values, then add the gap
  between that value and the next list entry. */
  while (p[1] == p[0] + 1) p++;
  add_to_class(options, xoptions, cb, p[0] + 1, (p[1] > 255) ? 255 : p[1] - 1);
  p++;
  }
}
#endif



/*************************************************
*   Main entry-point to compile a character class *
*************************************************/

/* This function consumes a "leaf", which is a set of characters that will
become a single OP_CLASS OP_NCLASS, OP_XCLASS, or OP_ALLANY. */

uint32_t *
PRIV(compile_class_not_nested)(uint32_t options, uint32_t xoptions,
  uint32_t *start_ptr, PCRE2_UCHAR **pcode, BOOL negate_class, BOOL* has_bitmap,
  int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr)
{
uint32_t *pptr = start_ptr;
PCRE2_UCHAR *code = *pcode;
BOOL should_flip_negation;
const uint8_t *cbits = cb->cbits;
/* Some functions such as add_to_class() or eclass processing
expects that the bitset is stored in cb->classbits.classbits. */
uint8_t *const classbits = cb->classbits.classbits;

#ifdef SUPPORT_UNICODE
BOOL utf = (options & PCRE2_UTF) != 0;
#else  /* No Unicode support */
BOOL utf = FALSE;
#endif

/* Helper variables for OP_XCLASS opcode (for characters > 255). */

#ifdef SUPPORT_WIDE_CHARS
uint32_t xclass_props;
PCRE2_UCHAR *class_uchardata;
class_ranges* cranges;
#endif

/* If an XClass contains a negative special such as \S, we need to flip the
negation flag at the end, so that support for characters > 255 works correctly
(they are all included in the class). An XClass may need to insert specific
matching or non-matching code for wide characters.
*/

should_flip_negation = FALSE;

/* XClass will be used when characters > 255
might match.
*/ #ifdef SUPPORT_WIDE_CHARS xclass_props = 0; #if PCRE2_CODE_UNIT_WIDTH == 8 cranges = NULL; if (utf) #endif { if (lengthptr != NULL) { cranges = compile_optimize_class(pptr, options, xoptions, cb); if (cranges == NULL) { *errorcodeptr = ERR21; return NULL; } /* Caching the pre-processed character ranges. */ if (cb->next_cranges != NULL) cb->next_cranges->next = cranges; else cb->cranges = cranges; cb->next_cranges = cranges; } else { /* Reuse the pre-processed character ranges. */ cranges = cb->cranges; PCRE2_ASSERT(cranges != NULL); cb->cranges = cranges->next; } if (cranges->range_list_size > 0) { const uint32_t *ranges = (const uint32_t*)(cranges + 1); if (ranges[0] <= 255) xclass_props |= XCLASS_HAS_8BIT_CHARS; if (ranges[cranges->range_list_size - 1] == GET_MAX_CHAR_VALUE(utf) && ranges[cranges->range_list_size - 2] <= 256) xclass_props |= XCLASS_HIGH_ANY; } } class_uchardata = code + LINK_SIZE + 2; /* For XCLASS items */ #endif /* SUPPORT_WIDE_CHARS */ /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map in a temporary bit of memory, in case the class contains fewer than two 8-bit characters because in that case the compiled code doesn't use the bit map. */ memset(classbits, 0, 32); /* Process items until end_ptr is reached. */ while (TRUE) { uint32_t meta = *(pptr++); BOOL local_negate; int posix_class; int taboffset, tabopt; class_bits_storage pbits; uint32_t escape, c; /* Handle POSIX classes such as [:alpha:] etc. */ switch (META_CODE(meta)) { case META_POSIX: case META_POSIX_NEG: local_negate = (meta == META_POSIX_NEG); posix_class = *(pptr++); if (local_negate) should_flip_negation = TRUE; /* Note negative special */ /* If matching is caseless, upper and lower are converted to alpha. This relies on the fact that the class table starts with alpha, lower, upper as the first 3 entries. 
*/ if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2) posix_class = 0; /* When PCRE2_UCP is set, some of the POSIX classes are converted to different escape sequences that use Unicode properties \p or \P. Others that are not available via \p or \P have to generate XCL_PROP/XCL_NOTPROP directly, which is done here. */ #ifdef SUPPORT_UNICODE /* TODO This entire block of code here appears to be unreachable!? I simply can't see how it can be hit, given that the frontend parser doesn't emit META_POSIX for GRAPH/PRINT/PUNCT when UCP is set. */ if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0) { uint32_t ptype; switch(posix_class) { case PC_GRAPH: case PC_PRINT: case PC_PUNCT: ptype = (posix_class == PC_GRAPH)? PT_PXGRAPH : (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT; PRIV(update_classbits)(ptype, 0, local_negate, classbits); if ((xclass_props & XCLASS_HIGH_ANY) == 0) { if (lengthptr != NULL) *lengthptr += 3; else { *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP; *class_uchardata++ = (PCRE2_UCHAR)ptype; *class_uchardata++ = 0; } xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS; } continue; /* For the other POSIX classes (ex: ascii) we are going to fall through to the non-UCP case and build a bit map for characters with code points less than 256. However, if we are in a negated POSIX class, characters with code points greater than 255 must either all match or all not match, depending on whether the whole class is not or is negated. For example, for [[:^ascii:]... they must all match, whereas for [^[:^ascii:]... they must not. In the special case where there are no xclass items, this is automatically handled by the use of OP_CLASS or OP_NCLASS, but an explicit range is needed for OP_XCLASS. Setting a flag here causes the range to be generated later when it is known that OP_XCLASS is required. In the 8-bit library this is relevant only in utf mode, since no wide characters can exist otherwise. 
*/ default: break; } } #endif /* SUPPORT_UNICODE */ /* In the non-UCP case, or when UCP makes no difference, we build the bit map for the POSIX class in a chunk of local store because we may be adding and subtracting from it, and we don't want to subtract bits that may be in the main map already. At the end we or the result into the bit map that is being built. */ posix_class *= 3; /* Copy in the first table (always present) */ memcpy(pbits.classbits, cbits + PRIV(posix_class_maps)[posix_class], 32); /* If there is a second table, add or remove it as required. */ taboffset = PRIV(posix_class_maps)[posix_class + 1]; tabopt = PRIV(posix_class_maps)[posix_class + 2]; if (taboffset >= 0) { if (tabopt >= 0) for (int i = 0; i < 32; i++) pbits.classbits[i] |= cbits[i + taboffset]; else for (int i = 0; i < 32; i++) pbits.classbits[i] &= (uint8_t)(~cbits[i + taboffset]); } /* Now see if we need to remove any special characters. An option value of 1 removes vertical space and 2 removes underscore. */ if (tabopt < 0) tabopt = -tabopt; if (tabopt == 1) pbits.classbits[1] &= ~0x3c; else if (tabopt == 2) pbits.classbits[11] &= 0x7f; /* Add the POSIX table or its complement into the main table that is being built and we are done. */ { uint32_t *classwords = cb->classbits.classwords; if (local_negate) for (int i = 0; i < 8; i++) classwords[i] |= (uint32_t)(~pbits.classwords[i]); else for (int i = 0; i < 8; i++) classwords[i] |= pbits.classwords[i]; } #ifdef SUPPORT_WIDE_CHARS /* Every class contains at least one < 256 character. */ xclass_props |= XCLASS_HAS_8BIT_CHARS; #endif continue; /* End of POSIX handling */ /* Other than POSIX classes, the only items we should encounter are \d-type escapes and literal characters (possibly as ranges). 
*/ case META_BIGVALUE: meta = *(pptr++); break; case META_ESCAPE: escape = META_DATA(meta); switch(escape) { case ESC_d: for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_digit]; break; case ESC_D: should_flip_negation = TRUE; for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~cbits[i+cbit_digit]); break; case ESC_w: for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_word]; break; case ESC_W: should_flip_negation = TRUE; for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~cbits[i+cbit_word]); break; /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl 5.18. Before PCRE 8.34, we had to preserve the VT bit if it was previously set by something earlier in the character class. Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so we could just adjust the appropriate bit. From PCRE 8.34 we no longer treat \s and \S specially. */ case ESC_s: for (int i = 0; i < 32; i++) classbits[i] |= cbits[i+cbit_space]; break; case ESC_S: should_flip_negation = TRUE; for (int i = 0; i < 32; i++) classbits[i] |= (uint8_t)(~cbits[i+cbit_space]); break; /* When adding the horizontal or vertical space lists to a class, or their complements, disable PCRE2_CASELESS, because it justs wastes time, and in the "not-x" UTF cases can create unwanted duplicates in the XCLASS list (provoked by characters that have more than one other case and by both cases being in the same "not-x" sublist). 
*/ case ESC_h: #if PCRE2_CODE_UNIT_WIDTH == 8 #ifdef SUPPORT_UNICODE if (cranges != NULL) break; #endif add_list_to_class(options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list)); #else PCRE2_ASSERT(cranges != NULL); #endif break; case ESC_H: #if PCRE2_CODE_UNIT_WIDTH == 8 #ifdef SUPPORT_UNICODE if (cranges != NULL) break; #endif add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions, cb, PRIV(hspace_list)); #else PCRE2_ASSERT(cranges != NULL); #endif break; case ESC_v: #if PCRE2_CODE_UNIT_WIDTH == 8 #ifdef SUPPORT_UNICODE if (cranges != NULL) break; #endif add_list_to_class(options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list)); #else PCRE2_ASSERT(cranges != NULL); #endif break; case ESC_V: #if PCRE2_CODE_UNIT_WIDTH == 8 #ifdef SUPPORT_UNICODE if (cranges != NULL) break; #endif add_not_list_to_class(options & ~PCRE2_CASELESS, xoptions, cb, PRIV(vspace_list)); #else PCRE2_ASSERT(cranges != NULL); #endif break; /* If Unicode is not supported, \P and \p are not allowed and are faulted at parse time, so will never appear here. */ #ifdef SUPPORT_UNICODE case ESC_p: case ESC_P: { uint32_t ptype = *pptr >> 16; uint32_t pdata = *(pptr++) & 0xffff; /* The "Any" is processed by PRIV(update_classbits)(). */ if (ptype == PT_ANY) { #if PCRE2_CODE_UNIT_WIDTH == 8 if (!utf && escape == ESC_p) memset(classbits, 0xff, 32); #endif continue; } PRIV(update_classbits)(ptype, pdata, (escape == ESC_P), classbits); if ((xclass_props & XCLASS_HIGH_ANY) == 0) { if (lengthptr != NULL) *lengthptr += 3; else { *class_uchardata++ = (escape == ESC_p)? XCL_PROP : XCL_NOTPROP; *class_uchardata++ = ptype; *class_uchardata++ = pdata; } xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_PROPS; } } continue; #endif } #ifdef SUPPORT_WIDE_CHARS /* Every non-property class contains at least one < 256 character. */ xclass_props |= XCLASS_HAS_8BIT_CHARS; #endif /* End handling \d-type escapes */ continue; CLASS_END_CASES(meta) /* Literals. 
*/ if (meta < META_END) break; /* Non-literals: end of class contents. */ goto END_PROCESSING; } /* A literal character may be followed by a range meta. At parse time there are checks for out-of-order characters, for ranges where the two characters are equal, and for hyphens that cannot indicate a range. At this point, therefore, no checking is needed. */ c = meta; /* Remember if \r or \n were explicitly used */ if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; /* Process a character range */ if (*pptr == META_RANGE_LITERAL || *pptr == META_RANGE_ESCAPED) { uint32_t d; #ifdef EBCDIC BOOL range_is_literal = (*pptr == META_RANGE_LITERAL); #endif ++pptr; d = *(pptr++); if (d == META_BIGVALUE) d = *(pptr++); /* Remember an explicit \r or \n, and add the range to the class. */ if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF; #if PCRE2_CODE_UNIT_WIDTH == 8 #ifdef SUPPORT_UNICODE if (cranges != NULL) continue; xclass_props |= XCLASS_HAS_8BIT_CHARS; #endif /* In an EBCDIC environment, Perl treats alphabetic ranges specially because there are holes in the encoding, and simply using the range A-Z (for example) would include the characters in the holes. This applies only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */ #ifdef EBCDIC if (range_is_literal && (cb->ctypes[c] & ctype_letter) != 0 && (cb->ctypes[d] & ctype_letter) != 0 && (c <= CHAR_z) == (d <= CHAR_z)) { uint32_t uc = (d <= CHAR_z)? 0 : 64; uint32_t C = c - uc; uint32_t D = d - uc; if (C <= CHAR_i) { add_to_class(options, xoptions, cb, C + uc, ((D < CHAR_i)? D : CHAR_i) + uc); C = CHAR_j; } if (C <= D && C <= CHAR_r) { add_to_class(options, xoptions, cb, C + uc, ((D < CHAR_r)? 
D : CHAR_r) + uc); C = CHAR_s; } if (C <= D) add_to_class(options, xoptions, cb, C + uc, D + uc); } else #endif /* Not an EBCDIC special range */ add_to_class(options, xoptions, cb, c, d); #else PCRE2_ASSERT(cranges != NULL); #endif continue; } /* End of range handling */ /* Character ranges are ignored when class_ranges is present. */ #if PCRE2_CODE_UNIT_WIDTH == 8 #ifdef SUPPORT_UNICODE if (cranges != NULL) continue; xclass_props |= XCLASS_HAS_8BIT_CHARS; #endif /* Handle a single character. */ add_to_class(options, xoptions, cb, meta, meta); #else PCRE2_ASSERT(cranges != NULL); #endif } /* End of main class-processing loop */ END_PROCESSING: #ifdef SUPPORT_WIDE_CHARS PCRE2_ASSERT((xclass_props & XCLASS_HAS_PROPS) == 0 || (xclass_props & XCLASS_HIGH_ANY) == 0); if (cranges != NULL) { uint32_t *range = (uint32_t*)(cranges + 1); uint32_t *end = range + cranges->range_list_size; while (range < end && range[0] < 256) { PCRE2_ASSERT((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0); /* Add range to bitset. If we are in UTF or UCP mode, then clear the caseless bit, because the cranges handle caselessness (only) in this condition; see the condition for PARSE_CLASS_CASELESS_UTF in compile_optimize_class(). */ add_to_class(((options & (PCRE2_UTF|PCRE2_UCP)) != 0)? (options & ~PCRE2_CASELESS) : options, xoptions, cb, range[0], range[1]); if (range[1] > 255) break; range += 2; } if (cranges->char_lists_size > 0) { /* The cranges structure is still used and freed later. 
*/ PCRE2_ASSERT((xclass_props & XCLASS_HIGH_ANY) == 0); xclass_props |= XCLASS_REQUIRED | XCLASS_HAS_CHAR_LISTS; } else { if ((xclass_props & XCLASS_HIGH_ANY) != 0) { PCRE2_ASSERT(range + 2 == end && range[0] <= 256 && range[1] >= GET_MAX_CHAR_VALUE(utf)); should_flip_negation = TRUE; range = end; } while (range < end) { uint32_t range_start = range[0]; uint32_t range_end = range[1]; range += 2; xclass_props |= XCLASS_REQUIRED; if (range_start < 256) range_start = 256; if (lengthptr != NULL) { #ifdef SUPPORT_UNICODE if (utf) { *lengthptr += 1; if (range_start < range_end) *lengthptr += PRIV(ord2utf)(range_start, class_uchardata); *lengthptr += PRIV(ord2utf)(range_end, class_uchardata); continue; } #endif /* SUPPORT_UNICODE */ *lengthptr += range_start < range_end ? 3 : 2; continue; } #ifdef SUPPORT_UNICODE if (utf) { if (range_start < range_end) { *class_uchardata++ = XCL_RANGE; class_uchardata += PRIV(ord2utf)(range_start, class_uchardata); } else *class_uchardata++ = XCL_SINGLE; class_uchardata += PRIV(ord2utf)(range_end, class_uchardata); continue; } #endif /* SUPPORT_UNICODE */ /* Without UTF support, character values are constrained by the bit length, and can only be > 256 for 16-bit and 32-bit libraries. */ #if PCRE2_CODE_UNIT_WIDTH != 8 if (range_start < range_end) { *class_uchardata++ = XCL_RANGE; *class_uchardata++ = range_start; } else *class_uchardata++ = XCL_SINGLE; *class_uchardata++ = range_end; #endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ } if (lengthptr == NULL) cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data); } } #endif /* SUPPORT_WIDE_CHARS */ /* If there are characters with values > 255, or Unicode property settings (\p or \P), we have to compile an extended class, with its own opcode, unless there were no property settings and there was a negated special such as \S in the class, and PCRE2_UCP is not set, because in that case all characters > 255 are in or not in the class, so any that were explicitly given as well can be ignored. 
In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were were present in a class, we either have to match or not match all wide characters (depending on whether the whole class is or is not negated). This requirement is indicated by match_all_or_no_wide_chars being true. We do this by including an explicit range, which works in both cases. This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there cannot be any wide characters in 8-bit non-UTF mode. When there *are* properties in a positive UTF-8 or any 16-bit or 32_bit class where \S etc is present without PCRE2_UCP, causing an extended class to be compiled, we make sure that all characters > 255 are included by forcing match_all_or_no_wide_chars to be true. If, when generating an xclass, there are no characters < 256, we can omit the bitmap in the actual compiled code. */ #ifdef SUPPORT_WIDE_CHARS /* Defined for 16/32 bits, or 8-bit with Unicode */ if ((xclass_props & XCLASS_REQUIRED) != 0) { PCRE2_UCHAR *previous = code; if ((xclass_props & XCLASS_HAS_CHAR_LISTS) == 0) *class_uchardata++ = XCL_END; /* Marks the end of extra data */ *code++ = OP_XCLASS; code += LINK_SIZE; *code = negate_class? XCL_NOT:0; if ((xclass_props & XCLASS_HAS_PROPS) != 0) *code |= XCL_HASPROP; /* If the map is required, move up the extra data to make room for it; otherwise just move the code pointer to the end of the extra data. 
*/ if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0 || has_bitmap != NULL) { if (negate_class) { uint32_t *classwords = cb->classbits.classwords; for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i]; } if (has_bitmap == NULL) { *code++ |= XCL_MAP; (void)memmove(code + (32 / sizeof(PCRE2_UCHAR)), code, CU2BYTES(class_uchardata - code)); memcpy(code, classbits, 32); code = class_uchardata + (32 / sizeof(PCRE2_UCHAR)); } else { code = class_uchardata; if ((xclass_props & XCLASS_HAS_8BIT_CHARS) != 0) *has_bitmap = TRUE; } } else code = class_uchardata; if ((xclass_props & XCLASS_HAS_CHAR_LISTS) != 0) { /* Char lists size is an even number, because all items are 16 or 32 bit values. The character list data is always aligned to 32 bits. */ size_t char_lists_size = cranges->char_lists_size; PCRE2_ASSERT((char_lists_size & 0x1) == 0 && (cb->char_lists_size & 0x3) == 0); if (lengthptr != NULL) { char_lists_size = CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t)); #if PCRE2_CODE_UNIT_WIDTH == 8 *lengthptr += 2 + LINK_SIZE; #else *lengthptr += 1 + LINK_SIZE; #endif cb->char_lists_size += char_lists_size; char_lists_size /= sizeof(PCRE2_UCHAR); /* Storage space for character lists is included in the maximum pattern size. */ if (*lengthptr > MAX_PATTERN_SIZE || MAX_PATTERN_SIZE - *lengthptr < char_lists_size) { *errorcodeptr = ERR20; /* Pattern is too large */ return NULL; } } else { uint8_t *data; PCRE2_ASSERT(cranges->char_lists_types <= XCL_TYPE_MASK); #if PCRE2_CODE_UNIT_WIDTH == 8 /* Encode as high / low bytes. */ code[0] = (uint8_t)(XCL_LIST | (cranges->char_lists_types >> 8)); code[1] = (uint8_t)cranges->char_lists_types; code += 2; #else *code++ = (PCRE2_UCHAR)(XCL_LIST | cranges->char_lists_types); #endif /* Character lists are stored in backwards direction from byte code start. The non-dfa/dfa matchers can access these lists using the byte code start stored in match blocks. 
Each list is aligned to 32 bit with an optional unused 16 bit value at the beginning of the character list. */ cb->char_lists_size += char_lists_size; data = (uint8_t*)cb->start_code - cb->char_lists_size; memcpy(data, (uint8_t*)(cranges + 1) + cranges->char_lists_start, char_lists_size); /* Since character lists total size is less than MAX_PATTERN_SIZE, their starting offset fits into a value which size is LINK_SIZE. */ char_lists_size = cb->char_lists_size; PUT(code, 0, (uint32_t)(char_lists_size >> 1)); code += LINK_SIZE; #if defined PCRE2_DEBUG || defined SUPPORT_VALGRIND if ((char_lists_size & 0x2) != 0) { /* In debug the unused 16 bit value is set to a fixed value and marked unused. */ ((uint16_t*)data)[-1] = 0x5555; #ifdef SUPPORT_VALGRIND VALGRIND_MAKE_MEM_NOACCESS(data - 2, 2); #endif } #endif cb->char_lists_size = CLIST_ALIGN_TO(char_lists_size, sizeof(uint32_t)); cb->cx->memctl.free(cranges, cb->cx->memctl.memory_data); } } /* Now fill in the complete length of the item */ PUT(previous, 1, (int)(code - previous)); goto DONE; /* End of class handling */ } #endif /* SUPPORT_WIDE_CHARS */ /* If there are no characters > 255, or they are all to be included or excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the whole class was negated and whether there were negative specials such as \S (non-UCP) in the class. Then copy the 32-byte map into the code vector, negating it if necessary. */ if (negate_class) { uint32_t *classwords = cb->classbits.classwords; for (int i = 0; i < 8; i++) classwords[i] = ~classwords[i]; } if ((SELECT_VALUE8(!utf, 0) || negate_class != should_flip_negation) && cb->classbits.classwords[0] == ~(uint32_t)0) { const uint32_t *classwords = cb->classbits.classwords; int i; for (i = 0; i < 8; i++) if (classwords[i] != ~(uint32_t)0) break; if (i == 8) { *code++ = OP_ALLANY; goto DONE; /* End of class handling */ } } *code++ = (negate_class == should_flip_negation) ? 
OP_CLASS : OP_NCLASS; memcpy(code, classbits, 32); code += 32 / sizeof(PCRE2_UCHAR); DONE: *pcode = code; return pptr - 1; } /* ===================================================================*/ /* Here follows a block of ECLASS-compiling functions. You may well want to read them from top to bottom; they are ordered from leafmost (at the top) to outermost parser (at the bottom of the file). */ /* This function folds one operand using the negation operator. The new, combined chunk of stack code is written out to *pop_info. */ static void fold_negation(eclass_op_info *pop_info, PCRE2_SIZE *lengthptr, BOOL preserve_classbits) { /* If the chunk of stack code is already composed of multiple ops, we won't descend in and try and propagate the negation down the tree. (That would lead to O(n^2) compile-time, which could be exploitable with a malicious regex - although maybe that's not really too much of a worry in a library that offers an exponential-time matching function!) */ if (pop_info->op_single_type == 0) { if (lengthptr != NULL) *lengthptr += 1; else pop_info->code_start[pop_info->length] = ECL_NOT; pop_info->length += 1; } /* Otherwise, it's a nice single-op item, so we can easily fold in the negation without needing to produce an ECL_NOT. */ else if (pop_info->op_single_type == ECL_ANY || pop_info->op_single_type == ECL_NONE) { pop_info->op_single_type = (pop_info->op_single_type == ECL_NONE)? ECL_ANY : ECL_NONE; if (lengthptr == NULL) *(pop_info->code_start) = pop_info->op_single_type; } else { PCRE2_ASSERT(pop_info->op_single_type == ECL_XCLASS && pop_info->length >= 1 + LINK_SIZE + 1); if (lengthptr == NULL) pop_info->code_start[1 + LINK_SIZE] ^= XCL_NOT; } if (!preserve_classbits) { for (int i = 0; i < 8; i++) pop_info->bits.classwords[i] = ~pop_info->bits.classwords[i]; } } /* This function folds together two operands using a binary operator. The new, combined chunk of stack code is written out to *lhs_op_info. 
*/ static void fold_binary(int op, eclass_op_info *lhs_op_info, eclass_op_info *rhs_op_info, PCRE2_SIZE *lengthptr) { switch (op) { /* ECL_AND truth table: LHS RHS RESULT ---------------- ANY * RHS * ANY LHS NONE * NONE * NONE NONE X Y X & Y */ case ECL_AND: if (rhs_op_info->op_single_type == ECL_ANY) { /* no-op: drop the RHS */ } else if (lhs_op_info->op_single_type == ECL_ANY) { /* no-op: drop the LHS, and memmove the RHS into its place */ if (lengthptr == NULL) memmove(lhs_op_info->code_start, rhs_op_info->code_start, CU2BYTES(rhs_op_info->length)); lhs_op_info->length = rhs_op_info->length; lhs_op_info->op_single_type = rhs_op_info->op_single_type; } else if (rhs_op_info->op_single_type == ECL_NONE) { /* the result is ECL_NONE: write into the LHS */ if (lengthptr == NULL) lhs_op_info->code_start[0] = ECL_NONE; lhs_op_info->length = 1; lhs_op_info->op_single_type = ECL_NONE; } else if (lhs_op_info->op_single_type == ECL_NONE) { /* the result is ECL_NONE: drop the RHS */ } else { /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. 
*/ if (lengthptr != NULL) *lengthptr += 1; else { PCRE2_ASSERT(rhs_op_info->code_start == lhs_op_info->code_start + lhs_op_info->length); rhs_op_info->code_start[rhs_op_info->length] = ECL_AND; } lhs_op_info->length += rhs_op_info->length + 1; lhs_op_info->op_single_type = 0; } for (int i = 0; i < 8; i++) lhs_op_info->bits.classwords[i] &= rhs_op_info->bits.classwords[i]; break; /* ECL_OR truth table: LHS RHS RESULT ---------------- ANY * ANY * ANY ANY NONE * RHS * NONE LHS X Y X | Y */ case ECL_OR: if (rhs_op_info->op_single_type == ECL_NONE) { /* no-op: drop the RHS */ } else if (lhs_op_info->op_single_type == ECL_NONE) { /* no-op: drop the LHS, and memmove the RHS into its place */ if (lengthptr == NULL) memmove(lhs_op_info->code_start, rhs_op_info->code_start, CU2BYTES(rhs_op_info->length)); lhs_op_info->length = rhs_op_info->length; lhs_op_info->op_single_type = rhs_op_info->op_single_type; } else if (rhs_op_info->op_single_type == ECL_ANY) { /* the result is ECL_ANY: write into the LHS */ if (lengthptr == NULL) lhs_op_info->code_start[0] = ECL_ANY; lhs_op_info->length = 1; lhs_op_info->op_single_type = ECL_ANY; } else if (lhs_op_info->op_single_type == ECL_ANY) { /* the result is ECL_ANY: drop the RHS */ } else { /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. 
*/ if (lengthptr != NULL) *lengthptr += 1; else { PCRE2_ASSERT(rhs_op_info->code_start == lhs_op_info->code_start + lhs_op_info->length); rhs_op_info->code_start[rhs_op_info->length] = ECL_OR; } lhs_op_info->length += rhs_op_info->length + 1; lhs_op_info->op_single_type = 0; } for (int i = 0; i < 8; i++) lhs_op_info->bits.classwords[i] |= rhs_op_info->bits.classwords[i]; break; /* ECL_XOR truth table: LHS RHS RESULT ---------------- ANY * !RHS * ANY !LHS NONE * RHS * NONE LHS X Y X ^ Y */ case ECL_XOR: if (rhs_op_info->op_single_type == ECL_NONE) { /* no-op: drop the RHS */ } else if (lhs_op_info->op_single_type == ECL_NONE) { /* no-op: drop the LHS, and memmove the RHS into its place */ if (lengthptr == NULL) memmove(lhs_op_info->code_start, rhs_op_info->code_start, CU2BYTES(rhs_op_info->length)); lhs_op_info->length = rhs_op_info->length; lhs_op_info->op_single_type = rhs_op_info->op_single_type; } else if (rhs_op_info->op_single_type == ECL_ANY) { /* the result is !LHS: fold in the negation, and drop the RHS */ /* Preserve the classbits, because we promise to deal with them later. */ fold_negation(lhs_op_info, lengthptr, TRUE); } else if (lhs_op_info->op_single_type == ECL_ANY) { /* the result is !RHS: drop the LHS, memmove the RHS into its place, and fold in the negation */ if (lengthptr == NULL) memmove(lhs_op_info->code_start, rhs_op_info->code_start, CU2BYTES(rhs_op_info->length)); lhs_op_info->length = rhs_op_info->length; lhs_op_info->op_single_type = rhs_op_info->op_single_type; /* Preserve the classbits, because we promise to deal with them later. */ fold_negation(lhs_op_info, lengthptr, TRUE); } else { /* Both of LHS & RHS are either ECL_XCLASS, or compound operations. 
*/ if (lengthptr != NULL) *lengthptr += 1; else { PCRE2_ASSERT(rhs_op_info->code_start == lhs_op_info->code_start + lhs_op_info->length); rhs_op_info->code_start[rhs_op_info->length] = ECL_XOR; } lhs_op_info->length += rhs_op_info->length + 1; lhs_op_info->op_single_type = 0; } for (int i = 0; i < 8; i++) lhs_op_info->bits.classwords[i] ^= rhs_op_info->bits.classwords[i]; break; default: PCRE2_DEBUG_UNREACHABLE(); break; } } static BOOL compile_eclass_nested(eclass_context *context, BOOL negated, uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, PCRE2_SIZE *lengthptr); /* This function consumes a group of implicitly-unioned class elements. These can be characters, ranges, properties, or nested classes, as long as they are all joined by being placed adjacently. */ static BOOL compile_class_operand(eclass_context *context, BOOL negated, uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, PCRE2_SIZE *lengthptr) { uint32_t *ptr = *pptr; uint32_t *prev_ptr; PCRE2_UCHAR *code = *pcode; PCRE2_UCHAR *code_start = code; PCRE2_SIZE prev_length = (lengthptr != NULL)? *lengthptr : 0; PCRE2_SIZE extra_length; uint32_t meta = META_CODE(*ptr); switch (meta) { case META_CLASS_EMPTY_NOT: case META_CLASS_EMPTY: ++ptr; pop_info->length = 1; if ((meta == META_CLASS_EMPTY) == negated) { *code++ = pop_info->op_single_type = ECL_ANY; memset(pop_info->bits.classbits, 0xff, 32); } else { *code++ = pop_info->op_single_type = ECL_NONE; memset(pop_info->bits.classbits, 0, 32); } break; case META_CLASS: case META_CLASS_NOT: if ((*ptr & CLASS_IS_ECLASS) != 0) { if (!compile_eclass_nested(context, negated, &ptr, &code, pop_info, lengthptr)) return FALSE; PCRE2_ASSERT(*ptr == META_CLASS_END); ptr++; goto DONE; } ptr++; /* Fall through */ default: /* Scan forward characters, ranges, and properties. For example: inside [a-z_ -- m] we don't have brackets around "a-z_" but we still need to collect that fragment up into a "leaf" OP_CLASS. 
*/ prev_ptr = ptr; ptr = PRIV(compile_class_not_nested)( context->options, context->xoptions, ptr, &code, (meta != META_CLASS_NOT) == negated, &context->needs_bitmap, context->errorcodeptr, context->cb, lengthptr); if (ptr == NULL) return FALSE; /* We must have a 100% guarantee that ptr increases when compile_class_operand() returns, even on Release builds, so that we can statically prove our loops terminate. */ if (ptr <= prev_ptr) { PCRE2_DEBUG_UNREACHABLE(); return FALSE; } /* If we fell through above, consume the closing ']'. */ if (meta == META_CLASS || meta == META_CLASS_NOT) { PCRE2_ASSERT(*ptr == META_CLASS_END); ptr++; } /* Regardless of whether (lengthptr == NULL), some data will still be written out to *pcode, which we need: we have to peek at it, to transform the opcode into the ECLASS version (since we need to hoist up the bitmaps). */ PCRE2_ASSERT(code > code_start); extra_length = (lengthptr != NULL)? *lengthptr - prev_length : 0; /* Easiest case: convert OP_ALLANY to ECL_ANY */ if (*code_start == OP_ALLANY) { PCRE2_ASSERT(code - code_start == 1 && extra_length == 0); pop_info->length = 1; *code_start = pop_info->op_single_type = ECL_ANY; memset(pop_info->bits.classbits, 0xff, 32); } /* For OP_CLASS and OP_NCLASS, we hoist out the bitmap and convert to ECL_NONE / ECL_ANY respectively. */ else if (*code_start == OP_CLASS || *code_start == OP_NCLASS) { PCRE2_ASSERT(code - code_start == 1 + 32 / sizeof(PCRE2_UCHAR) && extra_length == 0); pop_info->length = 1; *code_start = pop_info->op_single_type = (*code_start == OP_CLASS)? ECL_NONE : ECL_ANY; memcpy(pop_info->bits.classbits, code_start + 1, 32); /* Rewind the code pointer, but make sure we adjust *lengthptr, because we do need to reserve that space (even though we only use it temporarily). 
*/ if (lengthptr != NULL) *lengthptr += code - (code_start + 1); code = code_start + 1; if (!context->needs_bitmap && *code_start == ECL_NONE) { uint32_t *classwords = pop_info->bits.classwords; for (int i = 0; i < 8; i++) if (classwords[i] != 0) { context->needs_bitmap = TRUE; break; } } else context->needs_bitmap = TRUE; } /* Finally, for OP_XCLASS we hoist out the bitmap (if any), and convert to ECL_XCLASS. */ else { PCRE2_ASSERT(*code_start == OP_XCLASS); *code_start = pop_info->op_single_type = ECL_XCLASS; PCRE2_ASSERT(code - code_start >= 1 + LINK_SIZE + 1); memcpy(pop_info->bits.classbits, context->cb->classbits.classbits, 32); pop_info->length = (code - code_start) + extra_length; } break; } /* End of switch(meta) */ pop_info->code_start = (lengthptr == NULL)? code_start : NULL; if (lengthptr != NULL) { *lengthptr += code - code_start; code = code_start; } DONE: PCRE2_ASSERT(lengthptr == NULL || (code == code_start)); *pptr = ptr; *pcode = code; return TRUE; } /* This function consumes a group of implicitly-unioned class elements. These can be characters, ranges, properties, or nested classes, as long as they are all joined by being placed adjacently. */ static BOOL compile_class_juxtaposition(eclass_context *context, BOOL negated, uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, PCRE2_SIZE *lengthptr) { uint32_t *ptr = *pptr; PCRE2_UCHAR *code = *pcode; #ifdef PCRE2_DEBUG PCRE2_UCHAR *start_code = *pcode; #endif /* See compile_class_binary_loose() for comments on compile-time folding of the "negated" flag. */ /* Because it's a non-empty class, there must be an operand at the start. 
*/ if (!compile_class_operand(context, negated, &ptr, &code, pop_info, lengthptr)) return FALSE; while (*ptr != META_CLASS_END && !(*ptr >= META_ECLASS_AND && *ptr <= META_ECLASS_NOT)) { uint32_t op; BOOL rhs_negated; eclass_op_info rhs_op_info; if (negated) { /* !(A juxtapose B) -> !A && !B */ op = ECL_AND; rhs_negated = TRUE; } else { /* A juxtapose B -> A || B */ op = ECL_OR; rhs_negated = FALSE; } /* An operand must follow the operator. */ if (!compile_class_operand(context, rhs_negated, &ptr, &code, &rhs_op_info, lengthptr)) return FALSE; /* Convert infix to postfix (RPN). */ fold_binary(op, pop_info, &rhs_op_info, lengthptr); if (lengthptr == NULL) code = pop_info->code_start + pop_info->length; } PCRE2_ASSERT(lengthptr == NULL || code == start_code); *pptr = ptr; *pcode = code; return TRUE; } /* This function consumes unary prefix operators. */ static BOOL compile_class_unary(eclass_context *context, BOOL negated, uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, PCRE2_SIZE *lengthptr) { uint32_t *ptr = *pptr; #ifdef PCRE2_DEBUG PCRE2_UCHAR *start_code = *pcode; #endif while (*ptr == META_ECLASS_NOT) { ++ptr; negated = !negated; } *pptr = ptr; /* Because it's a non-empty class, there must be an operand. */ if (!compile_class_juxtaposition(context, negated, pptr, pcode, pop_info, lengthptr)) return FALSE; PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code); return TRUE; } /* This function consumes tightly-binding binary operators. */ static BOOL compile_class_binary_tight(eclass_context *context, BOOL negated, uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, PCRE2_SIZE *lengthptr) { uint32_t *ptr = *pptr; PCRE2_UCHAR *code = *pcode; #ifdef PCRE2_DEBUG PCRE2_UCHAR *start_code = *pcode; #endif /* See compile_class_binary_loose() for comments on compile-time folding of the "negated" flag. */ /* Because it's a non-empty class, there must be an operand at the start. 
*/ if (!compile_class_unary(context, negated, &ptr, &code, pop_info, lengthptr)) return FALSE; while (*ptr == META_ECLASS_AND) { uint32_t op; BOOL rhs_negated; eclass_op_info rhs_op_info; if (negated) { /* !(A && B) -> !A || !B */ op = ECL_OR; rhs_negated = TRUE; } else { /* A && B -> A && B */ op = ECL_AND; rhs_negated = FALSE; } ++ptr; /* An operand must follow the operator. */ if (!compile_class_unary(context, rhs_negated, &ptr, &code, &rhs_op_info, lengthptr)) return FALSE; /* Convert infix to postfix (RPN). */ fold_binary(op, pop_info, &rhs_op_info, lengthptr); if (lengthptr == NULL) code = pop_info->code_start + pop_info->length; } PCRE2_ASSERT(lengthptr == NULL || code == start_code); *pptr = ptr; *pcode = code; return TRUE; } /* This function consumes loosely-binding binary operators. */ static BOOL compile_class_binary_loose(eclass_context *context, BOOL negated, uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, PCRE2_SIZE *lengthptr) { uint32_t *ptr = *pptr; PCRE2_UCHAR *code = *pcode; #ifdef PCRE2_DEBUG PCRE2_UCHAR *start_code = *pcode; #endif /* We really want to fold the negation operator, if at all possible, so that simple cases can be reduced down. In particular, in 8-bit no-UTF mode, we want to produce a fully-folded expression, so that we can guarantee not to emit any OP_ECLASS codes (in the same way that we never emit OP_XCLASS in this mode). This has the consequence that with a little ingenuity, we can in fact avoid emitting (nearly...) all cases of the "NOT" operator. Imagine that we have: !(A ... We have parsed the preceding "!", and we are about to parse the "A" operand. We don't know yet whether there will even be a following binary operand! Both of these are possibilities for what follows: !(A && B) !(A) However, we can still fold the "!" into the "A" operand, because no matter what the following binary operator will be, we can produce an expression which is equivalent. 
*/ /* Because it's a non-empty class, there must be an operand at the start. */ if (!compile_class_binary_tight(context, negated, &ptr, &code, pop_info, lengthptr)) return FALSE; while (*ptr >= META_ECLASS_OR && *ptr <= META_ECLASS_XOR) { uint32_t op; BOOL op_neg; BOOL rhs_negated; eclass_op_info rhs_op_info; if (negated) { /* The whole expression is being negated; we respond by unconditionally negating the LHS A, before seeing what follows. And hooray! We can recover, no matter what follows. */ /* !(A || B) -> !A && !B */ /* !(A -- B) -> !(A && !B) -> !A || B */ /* !(A XOR B) -> !(!A XOR !B) -> !A XNOR !B */ op = (*ptr == META_ECLASS_OR )? ECL_AND : (*ptr == META_ECLASS_SUB)? ECL_OR : /*ptr == META_ECLASS_XOR*/ ECL_XOR; op_neg = (*ptr == META_ECLASS_XOR); rhs_negated = *ptr != META_ECLASS_SUB; } else { /* A || B -> A || B */ /* A -- B -> A && !B */ /* A XOR B -> A XOR B */ op = (*ptr == META_ECLASS_OR )? ECL_OR : (*ptr == META_ECLASS_SUB)? ECL_AND : /*ptr == META_ECLASS_XOR*/ ECL_XOR; op_neg = FALSE; rhs_negated = *ptr == META_ECLASS_SUB; } ++ptr; /* An operand must follow the operator. */ if (!compile_class_binary_tight(context, rhs_negated, &ptr, &code, &rhs_op_info, lengthptr)) return FALSE; /* Convert infix to postfix (RPN). */ fold_binary(op, pop_info, &rhs_op_info, lengthptr); if (op_neg) fold_negation(pop_info, lengthptr, FALSE); if (lengthptr == NULL) code = pop_info->code_start + pop_info->length; } PCRE2_ASSERT(lengthptr == NULL || code == start_code); *pptr = ptr; *pcode = code; return TRUE; } /* This function converts the META codes in pptr into opcodes written to pcode. The pptr must start at a META_CLASS or META_CLASS_NOT. The class is compiled as a left-associative sequence of operator applications. The pptr will be left pointing at the matching META_CLASS_END. 
*/ static BOOL compile_eclass_nested(eclass_context *context, BOOL negated, uint32_t **pptr, PCRE2_UCHAR **pcode, eclass_op_info *pop_info, PCRE2_SIZE *lengthptr) { uint32_t *ptr = *pptr; #ifdef PCRE2_DEBUG PCRE2_UCHAR *start_code = *pcode; #endif /* The CLASS_IS_ECLASS bit must be set since it is a nested class. */ PCRE2_ASSERT(*ptr == (META_CLASS | CLASS_IS_ECLASS) || *ptr == (META_CLASS_NOT | CLASS_IS_ECLASS)); if (*ptr++ == (META_CLASS_NOT | CLASS_IS_ECLASS)) negated = !negated; (*pptr)++; /* Because it's a non-empty class, there must be an operand at the start. */ if (!compile_class_binary_loose(context, negated, pptr, pcode, pop_info, lengthptr)) return FALSE; PCRE2_ASSERT(**pptr == META_CLASS_END); PCRE2_ASSERT(lengthptr == NULL || *pcode == start_code); return TRUE; } BOOL PRIV(compile_class_nested)(uint32_t options, uint32_t xoptions, uint32_t **pptr, PCRE2_UCHAR **pcode, int *errorcodeptr, compile_block *cb, PCRE2_SIZE *lengthptr) { eclass_context context; eclass_op_info op_info; PCRE2_SIZE previous_length = (lengthptr != NULL)? *lengthptr : 0; PCRE2_UCHAR *code = *pcode; PCRE2_UCHAR *previous; BOOL allbitsone = TRUE; context.needs_bitmap = FALSE; context.options = options; context.xoptions = xoptions; context.errorcodeptr = errorcodeptr; context.cb = cb; previous = code; *code++ = OP_ECLASS; code += LINK_SIZE; *code++ = 0; /* Flags, currently zero. */ if (!compile_eclass_nested(&context, FALSE, pptr, &code, &op_info, lengthptr)) return FALSE; if (lengthptr != NULL) { *lengthptr += code - previous; code = previous; /* (*lengthptr - previous_length) now holds the amount of buffer that we require to make the call to compile_class_nested() with lengthptr = NULL, and including the (1+LINK_SIZE+1) that we write out before that call. */ } /* Do some useful counting of what's in the bitmap. 
*/ for (int i = 0; i < 8; i++) if (op_info.bits.classwords[i] != 0xffffffff) { allbitsone = FALSE; break; } /* After constant-folding the extended class syntax, it may turn out to be a simple class after all. In that case, we can unwrap it from the OP_ECLASS container - and in fact, we must do so, because in 8-bit no-Unicode mode the matcher is compiled without support for OP_ECLASS. */ #ifndef SUPPORT_WIDE_CHARS PCRE2_ASSERT(op_info.op_single_type != 0); #else if (op_info.op_single_type != 0) #endif { /* Rewind back over the OP_ECLASS. */ code = previous; /* If the bits are all ones, and the "high characters" are all matched too, we use a special-cased encoding of OP_ALLANY. */ if (op_info.op_single_type == ECL_ANY && allbitsone) { /* Advancing code means rewinding lengthptr, at this point. */ if (lengthptr != NULL) *lengthptr -= 1; *code++ = OP_ALLANY; } /* If the high bits are all matched / all not-matched, then we emit an OP_NCLASS/OP_CLASS respectively. */ else if (op_info.op_single_type == ECL_ANY || op_info.op_single_type == ECL_NONE) { PCRE2_SIZE required_len = 1 + (32 / sizeof(PCRE2_UCHAR)); if (lengthptr != NULL) { if (required_len > (*lengthptr - previous_length)) *lengthptr = previous_length + required_len; } /* Advancing code means rewinding lengthptr, at this point. */ if (lengthptr != NULL) *lengthptr -= required_len; *code++ = (op_info.op_single_type == ECL_ANY)? OP_NCLASS : OP_CLASS; memcpy(code, op_info.bits.classbits, 32); code += 32 / sizeof(PCRE2_UCHAR); } /* Otherwise, we have an ECL_XCLASS, so we have the OP_XCLASS data there, but, we pulled out its bitmap into op_info, so now we have to put that back into the OP_XCLASS. */ else { #ifndef SUPPORT_WIDE_CHARS PCRE2_DEBUG_UNREACHABLE(); #else BOOL need_map = context.needs_bitmap; PCRE2_SIZE required_len; PCRE2_ASSERT(op_info.op_single_type == ECL_XCLASS); required_len = op_info.length + (need_map? 
32/sizeof(PCRE2_UCHAR) : 0); if (lengthptr != NULL) { /* Don't unconditionally request all the space we need - we may already have asked for more during processing of the ECLASS. */ if (required_len > (*lengthptr - previous_length)) *lengthptr = previous_length + required_len; /* The code we write out here won't be ignored, even during the (lengthptr != NULL) phase, because if there's a following quantifier it will peek backwards. So we do have to write out a (truncated) OP_XCLASS, even on this branch. */ *lengthptr -= 1 + LINK_SIZE + 1; *code++ = OP_XCLASS; PUT(code, 0, 1 + LINK_SIZE + 1); code += LINK_SIZE; *code++ = 0; } else { PCRE2_UCHAR *rest; PCRE2_SIZE rest_len; PCRE2_UCHAR flags; /* 1 unit: OP_XCLASS | LINK_SIZE units | 1 unit: flags | ...rest */ PCRE2_ASSERT(op_info.length >= 1 + LINK_SIZE + 1); rest = op_info.code_start + 1 + LINK_SIZE + 1; rest_len = (op_info.code_start + op_info.length) - rest; /* First read any data we use, before memmove splats it. */ flags = op_info.code_start[1 + LINK_SIZE]; PCRE2_ASSERT((flags & XCL_MAP) == 0); /* Next do the memmove before any writes. */ memmove(code + 1 + LINK_SIZE + 1 + (need_map? 32/sizeof(PCRE2_UCHAR) : 0), rest, CU2BYTES(rest_len)); /* Finally write the header data. */ *code++ = OP_XCLASS; PUT(code, 0, (int)required_len); code += LINK_SIZE; *code++ = flags | (need_map? XCL_MAP : 0); if (need_map) { memcpy(code, op_info.bits.classbits, 32); code += 32 / sizeof(PCRE2_UCHAR); } code += rest_len; } #endif /* SUPPORT_WIDE_CHARS */ } } /* Otherwise, we're going to keep the OP_ECLASS. However, again we need to do some adjustment to insert the bitmap if we have one. */ #ifdef SUPPORT_WIDE_CHARS else { BOOL need_map = context.needs_bitmap; PCRE2_SIZE required_len = 1 + LINK_SIZE + 1 + (need_map? 
32/sizeof(PCRE2_UCHAR) : 0) + op_info.length; if (lengthptr != NULL) { if (required_len > (*lengthptr - previous_length)) *lengthptr = previous_length + required_len; /* As for the XCLASS branch above, we do have to write out a dummy OP_ECLASS, because of the backwards peek by the quantifier code. Write out a (truncated) OP_ECLASS, even on this branch. */ *lengthptr -= 1 + LINK_SIZE + 1; *code++ = OP_ECLASS; PUT(code, 0, 1 + LINK_SIZE + 1); code += LINK_SIZE; *code++ = 0; } else { if (need_map) { PCRE2_UCHAR *map_start = previous + 1 + LINK_SIZE + 1; previous[1 + LINK_SIZE] |= ECL_MAP; memmove(map_start + 32/sizeof(PCRE2_UCHAR), map_start, CU2BYTES(code - map_start)); memcpy(map_start, op_info.bits.classbits, 32); code += 32 / sizeof(PCRE2_UCHAR); } PUT(previous, 1, (int)(code - previous)); } } #endif /* SUPPORT_WIDE_CHARS */ *pcode = code; return TRUE; } /* End of pcre2_compile_class.c */