|
Line 0
Link Here
|
|
|
1 |
/* |
| 2 |
* Copyright (c) 2013 Damien Miller <djm@mindrot.org> |
| 3 |
* |
| 4 |
* Permission to use, copy, modify, and distribute this software for any |
| 5 |
* purpose with or without fee is hereby granted, provided that the above |
| 6 |
* copyright notice and this permission notice appear in all copies. |
| 7 |
* |
| 8 |
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
| 9 |
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
| 10 |
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
| 11 |
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
| 12 |
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
| 13 |
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
| 14 |
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
| 15 |
*/ |
| 16 |
|
| 17 |
/* |
| 18 |
* This is a simple RFC3454 stringprep profile to sanitise UTF-8 strings |
| 19 |
* from untrusted sources. |
| 20 |
* |
| 21 |
* It is intended to be used prior to display of untrusted strings only. |
| 22 |
* It should not be used for logging because of bi-di ambiguity. It |
| 23 |
* should also not be used in any case where lack of normalisation may |
| 24 |
* cause problems. |
| 25 |
* |
| 26 |
* This profile uses the prohibition and mapping tables from RFC3454 |
| 27 |
* (listed below) but the unassigned character table has been updated to |
| 28 |
* Unicode 6.2. It uses a local whitelist of whitespace characters (\n, |
| 29 |
* \a and \t). Unicode normalisation and bi-di testing are not used. |
| 30 |
* |
| 31 |
* XXX: implement bi-di handling (needed for logs) |
| 32 |
* XXX: implement KC normalisation (needed for passing to libs/syscalls) |
| 33 |
*/ |
| 34 |
|
| 35 |
#include <sys/types.h> |
| 36 |
#include <stdio.h> |
| 37 |
#include <stdlib.h> |
| 38 |
#include <string.h> |
| 39 |
#include <limits.h> |
| 40 |
#include <ctype.h> |
| 41 |
|
| 42 |
#include "misc.h" |
| 43 |
|
| 44 |
/*
 * A closed interval [lo, hi] of UCS code points. The lookup tables
 * consulted by code_in_table() are arrays of these ranges.
 */
struct u32_range {
	u_int32_t lo;	/* First code point in the range (inclusive) */
	u_int32_t hi;	/* Last code point in the range (inclusive) */
};
| 47 |
|
| 48 |
#include "stringprep-tables.c" |
| 49 |
|
| 50 |
/* Returns 1 if code 'c' appears in the table or 0 otherwise */ |
| 51 |
static int |
| 52 |
code_in_table(u_int32_t c, const struct u32_range *table, size_t tlen) |
| 53 |
{ |
| 54 |
const struct u32_range *e, *end = (void *)(tlen + (char *)table); |
| 55 |
|
| 56 |
for (e = table; e < end; e++) { |
| 57 |
if (c >= e->lo && c <= e->hi) |
| 58 |
return 1; |
| 59 |
} |
| 60 |
return 0; |
| 61 |
} |
| 62 |
|
| 63 |
/*
 * Decode the next well-formed UCS character from the NUL-terminated UTF-8
 * string 'in', skipping past malformed input.
 *
 * Returns the decoded character, or 0 when the end of the string is reached.
 * *nextc is always updated: it points just past the returned character, or
 * at the terminating NUL when 0 is returned.
 *
 * If had_error is non-NULL it is set to 1 whenever malformed input is
 * skipped; it is never cleared here, so callers must initialise it.
 *
 * The following are treated as malformed and skipped:
 *  - stray continuation bytes and invalid lead bytes (0xf8..0xff),
 *  - sequences cut short by a non-continuation byte or by end-of-string,
 *  - overlong encodings (a code point encoded in more bytes than needed),
 *  - code points above U+10FFFF (banned by RFC3629).
 *
 * Surrogate code points (U+D800..U+DFFF) are NOT filtered here; that is
 * left to the caller.
 */
static u_int32_t
decode_utf8(const char *in, const char **nextc, int *had_error)
{
	/* Smallest code point that legitimately needs N bytes, indexed by N */
	static const u_int32_t min_code[] = { 0, 0, 0x80, 0x800, 0x10000 };
	unsigned int seqlen = 0;	/* Total bytes in the current sequence */
	unsigned int remaining = 0;	/* Continuation bytes still expected */
	int skipping = 0;		/* Eating the tail of a bad sequence */
	u_int32_t c = 0;
	unsigned char b;
	size_t i;

	for (i = 0; in[i] != '\0'; i++) {
		b = (unsigned char)in[i];

		if (remaining > 0) {
			if ((b & 0xc0) == 0x80) {
				remaining--;
				c |= (u_int32_t)(b & 0x3f) << (remaining * 6);
				if (remaining > 0)
					continue;
				/*
				 * Sequence complete. Reject overlong forms
				 * and values beyond the Unicode range, then
				 * carry on with the byte that follows.
				 */
				if (c < min_code[seqlen] || c > 0x10FFFF) {
					if (had_error != NULL)
						*had_error = 1;
					continue;
				}
				*nextc = in + i + 1;
				return c;
			}
			/*
			 * The sequence was cut short by a byte that is not a
			 * continuation. Flag it and fall through so that this
			 * byte is examined as the start of a new character.
			 */
			remaining = 0;
			if (had_error != NULL)
				*had_error = 1;
		} else if (skipping && (b & 0xc0) == 0x80) {
			/* Still inside the tail of an invalid sequence */
			continue;
		}
		skipping = 0;

		/* Start of a new character */
		if ((b & 0x80) == 0) {			/* 7 bit code */
			*nextc = in + i + 1;
			return b;
		} else if ((b & 0xe0) == 0xc0) {	/* 11 bit code point */
			seqlen = 2;
			c = (u_int32_t)(b & 0x1f) << 6;
		} else if ((b & 0xf0) == 0xe0) {	/* 16 bit code point */
			seqlen = 3;
			c = (u_int32_t)(b & 0x0f) << 12;
		} else if ((b & 0xf8) == 0xf0) {	/* 21 bit code point */
			seqlen = 4;
			c = (u_int32_t)(b & 0x07) << 18;
		} else {
			/*
			 * A stray continuation byte, a five or six byte
			 * header, or 0xfe/0xff. Skip it together with any
			 * continuation bytes that follow.
			 */
			skipping = 1;
			if (had_error != NULL)
				*had_error = 1;
			continue;
		}
		remaining = seqlen - 1;
	}

	/* End of string reached in the middle of a multibyte sequence */
	if (remaining != 0 && had_error != NULL)
		*had_error = 1;
	*nextc = in + i;
	return 0;
}
| 151 |
|
| 152 |
/*
 * Encode the UCS character 'c' as a UTF-8 sequence at 's', which has room
 * for 'slen' bytes. No NUL terminator is written.
 *
 * Returns the number of bytes written (1..4), or -1 if the sequence does
 * not fit in 'slen' bytes or 'c' cannot be represented in UTF-8 (a
 * surrogate U+D800..U+DFFF or a value above U+10FFFF; see RFC3629).
 * Nothing is written to 's' on failure.
 */
static int
encode_utf8(u_int32_t c, char *s, size_t slen)
{
	size_t i, need;
	unsigned char lead, bits;

	if (c < 0x80) {
		need = 1;
		lead = 0x00;
	} else if (c < 0x800) {
		need = 2;
		lead = 0xc0;
	} else if (c < 0x10000) {
		/* Surrogates are reserved for UTF-16 */
		if (c >= 0xD800 && c <= 0xDFFF)
			return -1;
		need = 3;
		lead = 0xe0;
	} else if (c <= 0x10FFFF) {
		need = 4;
		lead = 0xf0;
	} else {
		/* Invalid code point > U+10FFFF */
		return -1;
	}
	if (need > slen)
		return -1;

	if (need == 1) {
		/* All seven bits are payload; no length marker to merge */
		s[0] = (char)c;
		return 1;
	}
	for (i = 0; i < need; i++) {
		/*
		 * Six payload bits per byte, most significant group first.
		 * The range checks above guarantee the lead byte's group
		 * never overlaps its length marker.
		 */
		bits = (unsigned char)((c >> ((need - i - 1) * 6)) & 0x3f);
		s[i] = (char)((i == 0 ? lead : 0x80) | bits);
	}
	return (int)need;
}
| 188 |
|
| 189 |
|
| 190 |
/* |
| 191 |
* Normalise a UTF-8 string using the RFC3454 stringprep algorithm. |
| 192 |
* Returns 0 on success or -1 on failure (prohibited code or insufficient |
| 193 |
* length in the output string. |
| 194 |
* Requires an output buffer at most the same length as the input. |
| 195 |
*/ |
| 196 |
int |
| 197 |
utf8_stringprep(const char *in, char *out, size_t olen) |
| 198 |
{ |
| 199 |
int r; |
| 200 |
size_t o; |
| 201 |
u_int32_t c; |
| 202 |
|
| 203 |
if (olen < 1) |
| 204 |
return -1; |
| 205 |
|
| 206 |
for (o = 0; (c = decode_utf8(in, &in, NULL)) != 0;) { |
| 207 |
/* Mapping */ |
| 208 |
if (code_in_table(c, map_to_nothing, sizeof(map_to_nothing))) |
| 209 |
continue; |
| 210 |
|
| 211 |
/* Prohibitied output */ |
| 212 |
if (code_in_table(c, prohibited, sizeof(prohibited)) && |
| 213 |
!code_in_table(c, whitelist, sizeof(whitelist))) |
| 214 |
return -1; |
| 215 |
|
| 216 |
/* Map unassigned code points to U+FFFD */ |
| 217 |
if (code_in_table(c, unassigned, sizeof(unassigned))) |
| 218 |
c = 0xFFFD; |
| 219 |
|
| 220 |
/* Encode the character */ |
| 221 |
r = encode_utf8(c, out + o, olen - o - 1); |
| 222 |
if (r < 0) |
| 223 |
return -1; |
| 224 |
o += r; |
| 225 |
} |
| 226 |
out[o] = '\0'; |
| 227 |
return 0; |
| 228 |
} |
| 229 |
|