/* utf8_decode_next.h -- part of c-utf8 <https://github.com/chansen/c-utf8> */
/*
 * Copyright (c) 2026 Christian Hansen <chansen@cpan.org>
 * <https://github.com/chansen/c-utf8>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef UTF8_DECODE_NEXT_H
#define UTF8_DECODE_NEXT_H
#include <stddef.h>
#include <stdint.h>

#ifndef UTF8_DFA64_H
#  error "include utf8_dfa64.h before utf8_decode_next.h"
#endif

#ifdef __cplusplus
extern "C" {
#endif

/*
 * utf8_decode_next -- decode a single codepoint from the front of
 * src[0..len).
 *
 * Return value:
 *   > 0   Success. The value is the number of bytes consumed (1-4) and
 *         the decoded codepoint has been stored in *codepoint.
 *   = 0   len was 0 (end of input). *codepoint is left untouched.
 *   < 0   Ill-formed input. The magnitude is the length of the maximal
 *         subpart (so the result lies in -1..-3). *codepoint is left
 *         untouched. To resynchronise, skip -return_value bytes and call
 *         again.
 *
 * "Maximal subpart" is the Unicode term for the longest prefix, starting
 * at the ill-formed offset, that is either the initial subsequence of a
 * well-formed sequence or a single code unit. One U+FFFD substitution
 * character corresponds to each maximal subpart.
 */
static inline int utf8_decode_next(const char* src,
                                   size_t len,
                                   uint32_t* codepoint) {
  if (len == 0) {
    return 0;
  }

  const uint8_t* in = (const uint8_t*)src;
  utf8_dfa_state_t st = UTF8_DFA_ACCEPT;
  uint32_t value = 0;
  size_t used = 0;

  while (used < len) {
    st = utf8_dfa_step_decode(st, in[used], &value);
    used++;

    if (st == UTF8_DFA_ACCEPT) {
      *codepoint = value;
      return (int)used;
    }

    if (st == UTF8_DFA_REJECT) {
      /* in[used - 1] is the byte that caused the rejection. When it is
       * the very first byte it forms the maximal subpart by itself.
       * Otherwise the bytes consumed before it are the maximal subpart
       * and the offending byte is left for the next call. */
      if (used > 1) {
        used--;
      }
      break;
    }
  }

  /* Reached either after a rejection (used already adjusted above) or
   * because the input ran out mid-sequence, in which case every byte
   * consumed so far is the maximal subpart. */
  return -(int)used;
}

/*
 * utf8_decode_next_replace -- variant of utf8_decode_next that never
 * reports an error to the caller.
 *
 * For ill-formed input it stores U+FFFD in *codepoint and returns the
 * length of the maximal subpart as a positive count, so the result can
 * always be used directly as the number of bytes to advance. The only
 * case in which 0 is returned is len == 0, and *codepoint is not written
 * then.
 */
static inline int utf8_decode_next_replace(const char* src,
                                           size_t len,
                                           uint32_t* codepoint) {
  if (len == 0) {
    return 0;
  }

  const uint8_t* in = (const uint8_t*)src;
  utf8_dfa_state_t st = UTF8_DFA_ACCEPT;
  uint32_t value = 0;
  size_t used = 0;

  while (used < len) {
    st = utf8_dfa_step_decode(st, in[used], &value);
    used++;

    if (st == UTF8_DFA_ACCEPT) {
      *codepoint = value;
      return (int)used;
    }

    if (st == UTF8_DFA_REJECT) {
      /* Leave the rejecting byte for the next call unless it was the
       * first byte, which is consumed as a one-byte subpart. */
      if (used > 1) {
        used--;
      }
      break;
    }
  }

  /* Rejected or truncated sequence: substitute U+FFFD and report how
   * many bytes the maximal subpart covers. */
  *codepoint = 0xFFFDu;
  return (int)used;
}

#ifdef __cplusplus
}
#endif
#endif /* UTF8_DECODE_NEXT_H */