-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathutf8_decode_next.h
More file actions
118 lines (107 loc) · 4 KB
/
utf8_decode_next.h
File metadata and controls
118 lines (107 loc) · 4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/*
* Copyright (c) 2026 Christian Hansen <chansen@cpan.org>
* <https://github.com/chansen/c-utf8>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef UTF8_DECODE_NEXT_H
#define UTF8_DECODE_NEXT_H
#include <stddef.h>
#include <stdint.h>
#ifndef UTF8_DFA64_H
# error "include utf8_dfa64.h before utf8_decode_next.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
/*
 * utf8_decode_next -- read a single codepoint from the front of
 * src[0..len).
 *
 * Return value:
 *   > 0  a well-formed sequence was decoded; the value is the number of
 *        bytes it occupies (1-4) and the codepoint is stored in
 *        *codepoint.
 *   = 0  len is 0, there is nothing to decode; *codepoint is not written.
 *   < 0  the input starts with an ill-formed subsequence; the value is
 *        the negated length of its maximal subpart (-1..-3) and
 *        *codepoint is not written. Skip -return_value bytes before
 *        decoding again.
 *
 * "Maximal subpart" is the Unicode term for the longest prefix, starting
 * at the ill-formed offset, that is either the initial subsequence of a
 * well-formed sequence or a single code unit. A decoder that substitutes
 * errors emits one U+FFFD per maximal subpart.
 */
static inline int utf8_decode_next(const char* src,
                                   size_t len,
                                   uint32_t* codepoint) {
  if (len == 0) {
    return 0;
  }

  const uint8_t* in = (const uint8_t*)src;
  utf8_dfa_state_t st = UTF8_DFA_ACCEPT;
  uint32_t value = 0;
  size_t used = 0;

  /* len >= 1 here, so at least one byte is always fed to the DFA. */
  while (used < len) {
    st = utf8_dfa_step_decode(st, in[used], &value);
    ++used;

    if (st == UTF8_DFA_ACCEPT) {
      *codepoint = value;
      return (int)used;
    }

    if (st == UTF8_DFA_REJECT) {
      /* in[used - 1] is the byte that caused the rejection. A rejected
       * first byte is a maximal subpart on its own. A rejected later
       * byte is left for the next call: only the bytes before it make
       * up the maximal subpart. */
      size_t subpart = (used == 1) ? 1 : used - 1;
      return -(int)subpart;
    }
  }

  /* Input ended in the middle of a sequence: everything consumed so far
   * is the maximal subpart. */
  return -(int)used;
}
/*
 * utf8_decode_next_replace -- substituting variant of utf8_decode_next.
 *
 * Decodes one codepoint from src[0..len). When the input starts with an
 * ill-formed subsequence, U+FFFD is stored in *codepoint and the length
 * of the maximal subpart is returned as a positive number, so the caller
 * can always advance by the return value. The result is never negative,
 * and it is 0 only when len is 0 (in which case *codepoint is not
 * written).
 */
static inline int utf8_decode_next_replace(const char* src,
                                           size_t len,
                                           uint32_t* codepoint) {
  if (len == 0) {
    return 0;
  }

  const uint8_t* in = (const uint8_t*)src;
  utf8_dfa_state_t st = UTF8_DFA_ACCEPT;
  uint32_t value = 0;
  size_t used = 0;

  while (used < len) {
    st = utf8_dfa_step_decode(st, in[used], &value);
    used += 1;

    if (st == UTF8_DFA_ACCEPT) {
      *codepoint = value;
      return (int)used;
    }

    if (st == UTF8_DFA_REJECT) {
      /* Unless it was the very first byte, the rejected byte is not part
       * of the maximal subpart; hand it back to the next call. */
      if (used > 1) {
        used -= 1;
      }
      break;
    }
  }

  /* Reached on rejection or when the input ends mid-sequence: either
   * way `used` now holds the maximal subpart length (>= 1). */
  *codepoint = 0xFFFDu;
  return (int)used;
}
#ifdef __cplusplus
}
#endif
#endif /* UTF8_DECODE_NEXT_H */