From 5d13a8684c8637334d17f2b0577fc511aa275477 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Wed, 23 May 2007 08:50:11 +0000 Subject: [PATCH] Check for malformed UTF-8 characters. Thanks to Gary Anderson for pointing this out. --- src/siconv.c | 98 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 55 insertions(+), 43 deletions(-) diff --git a/src/siconv.c b/src/siconv.c index bdb20ee..1420e78 100644 --- a/src/siconv.c +++ b/src/siconv.c @@ -2,7 +2,7 @@ * Copyright (C) 1995-2007, Index Data ApS * See the file LICENSE for details. * - * $Id: siconv.c,v 1.40 2007-05-03 22:20:45 adam Exp $ + * $Id: siconv.c,v 1.41 2007-05-23 08:50:11 adam Exp $ */ /** * \file siconv.c @@ -212,6 +212,7 @@ unsigned long yaz_read_UTF8_char(unsigned char *inp, { unsigned long x = 0; + *no_read = 0; /* by default */ if (inp[0] <= 0x7f) { x = inp[0]; @@ -219,75 +220,86 @@ unsigned long yaz_read_UTF8_char(unsigned char *inp, } else if (inp[0] <= 0xbf || inp[0] >= 0xfe) { - *no_read = 0; *error = YAZ_ICONV_EILSEQ; } else if (inp[0] <= 0xdf && inbytesleft >= 2) { - x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f); - if (x >= 0x80) - *no_read = 2; - else + if ((inp[1] & 0xc0) == 0x80) { - *no_read = 0; - *error = YAZ_ICONV_EILSEQ; + x = ((inp[0] & 0x1f) << 6) | (inp[1] & 0x3f); + if (x >= 0x80) + *no_read = 2; + else + *error = YAZ_ICONV_EILSEQ; } + else + *error = YAZ_ICONV_EILSEQ; } else if (inp[0] <= 0xef && inbytesleft >= 3) { - x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) | - (inp[2] & 0x3f); - if (x >= 0x800) - *no_read = 3; - else + if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80) { - *no_read = 0; - *error = YAZ_ICONV_EILSEQ; + x = ((inp[0] & 0x0f) << 12) | ((inp[1] & 0x3f) << 6) | + (inp[2] & 0x3f); + if (x >= 0x800) + *no_read = 3; + else + *error = YAZ_ICONV_EILSEQ; } - } + else + *error = YAZ_ICONV_EILSEQ; + } else if (inp[0] <= 0xf7 && inbytesleft >= 4) { - x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) | - ((inp[2] & 0x3f) << 6) | 
(inp[3] & 0x3f); - if (x >= 0x10000) - *no_read = 4; - else + if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80 + && (inp[3] & 0xc0) == 0x80) { - *no_read = 0; - *error = YAZ_ICONV_EILSEQ; + x = ((inp[0] & 0x07) << 18) | ((inp[1] & 0x3f) << 12) | + ((inp[2] & 0x3f) << 6) | (inp[3] & 0x3f); + if (x >= 0x10000) + *no_read = 4; + else + *error = YAZ_ICONV_EILSEQ; } + else + *error = YAZ_ICONV_EILSEQ; } else if (inp[0] <= 0xfb && inbytesleft >= 5) { - x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) | - ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) | - (inp[4] & 0x3f); - if (x >= 0x200000) - *no_read = 5; - else + if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80 + && (inp[3] & 0xc0) == 0x80 && (inp[4] & 0xc0) == 0x80) { - *no_read = 0; - *error = YAZ_ICONV_EILSEQ; + x = ((inp[0] & 0x03) << 24) | ((inp[1] & 0x3f) << 18) | + ((inp[2] & 0x3f) << 12) | ((inp[3] & 0x3f) << 6) | + (inp[4] & 0x3f); + if (x >= 0x200000) + *no_read = 5; + else + *error = YAZ_ICONV_EILSEQ; } + else + *error = YAZ_ICONV_EILSEQ; } else if (inp[0] <= 0xfd && inbytesleft >= 6) { - x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) | - ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) | - ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f); - if (x >= 0x4000000) - *no_read = 6; - else + if ((inp[1] & 0xc0) == 0x80 && (inp[2] & 0xc0) == 0x80 + && (inp[3] & 0xc0) == 0x80 && (inp[4] & 0xc0) == 0x80 + && (inp[5] & 0xc0) == 0x80) { - *no_read = 0; - *error = YAZ_ICONV_EILSEQ; + x = ((inp[0] & 0x01) << 30) | ((inp[1] & 0x3f) << 24) | + ((inp[2] & 0x3f) << 18) | ((inp[3] & 0x3f) << 12) | + ((inp[4] & 0x3f) << 6) | (inp[5] & 0x3f); + if (x >= 0x4000000) + *no_read = 6; + else + *error = YAZ_ICONV_EILSEQ; } + else + *error = YAZ_ICONV_EILSEQ; } else - { - *no_read = 0; - *error = YAZ_ICONV_EINVAL; - } + *error = YAZ_ICONV_EINVAL; /* incomplete sentence */ + return x; } -- 1.7.10.4