1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) 1995-2008 Index Data
3 * See the file LICENSE for details.
7 * \brief ISO-8859-1 encoding / decoding
20 #include <yaz/xmalloc.h>
25 unsigned long compose_char;
34 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
35 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
36 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
37 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
38 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
39 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
40 /* omitted: 0xc6 LATIN CAPITAL LETTER AE */
41 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
42 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
43 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
44 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
45 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
46 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
47 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
48 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
49 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
50 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
51 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
52 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
53 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
54 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
55 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
56 /* omitted: 0xd7 MULTIPLICATION SIGN */
57 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
58 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
59 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
60 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
61 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
62 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
63 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
64 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
65 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
66 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
67 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
68 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
69 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
70 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
71 /* omitted: 0xe6 LATIN SMALL LETTER AE */
72 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
73 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
74 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
75 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
76 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
77 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
78 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
79 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
80 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
81 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
82 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
83 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
84 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
85 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
86 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
87 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
88 /* omitted: 0xf7 DIVISION SIGN */
89 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
90 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
91 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
92 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
93 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
94 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
95 /* omitted: 0xfe LATIN SMALL LETTER THORN */
96 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
/**
 * \brief Looks up a composed Latin-1 code in latin1_comb and yields its pair
 * \param v  value compared against the .y member of each table entry
 * \param x1 receives the .x1 member of the matching entry
 * \param x2 receives the .x2 member of the matching entry
 *
 * The table is only searched when v lies in 0xc0..0xff, the span covered by
 * the .y values of the table.
 * NOTE(review): the return statements are not visible here; presumably the
 * result tells the caller whether a match was found - confirm.
 */
101 int yaz_iso_8859_1_lookup_y(unsigned long v,
102 unsigned long *x1, unsigned long *x2)
104 if (v >= 0xc0 && v <= 0xff) /* optimization. min and max .y values */
/* walk the table; the loop stops at the entry whose x1 is 0 */
107 for (i = 0; latin1_comb[i].x1; i++)
109 if (v == latin1_comb[i].y)
/* match: hand both members of the pair back to the caller */
111 *x1 = latin1_comb[i].x1;
112 *x2 = latin1_comb[i].x2;
/**
 * \brief Looks up a two-value pair in latin1_comb and yields its composed code
 * \param x1 value compared against the .x1 member of each table entry
 * \param x2 value compared against the .x2 member of each table entry
 *
 * When an entry matches both x1 and x2, its .y member is stored through y.
 * NOTE(review): the return statements are not visible here; presumably the
 * result tells the caller whether a match was found - confirm.
 */
120 int yaz_iso_8859_1_lookup_x12(unsigned long x1, unsigned long x2,
123 /* For MARC8s we try to get a Latin-1 page code out of it */
/* walk the table; the loop stops at the entry whose x1 is 0 */
125 for (i = 0; latin1_comb[i].x1; i++)
126 if (x2 == latin1_comb[i].x2 && x1 == latin1_comb[i].x1)
/* both members match: report the single composed code */
128 *y = latin1_comb[i].y;
/**
 * \brief Encoder write handler: emits character x as one ISO-8859-1 byte
 * \param cd           conversion handle; error codes are set on it
 * \param e            encoder; e->data holds the encoder_data with compose_char
 * \param outbuf       in/out output position, updated after a byte is written
 * \param outbytesleft room left in the output buffer, checked before writing
 *
 * The pair (compose_char, x) is searched in latin1_comb. On a match x is
 * replaced by the entry's .y value; when no entry matches, compose_char is
 * written out by itself. Afterwards x is classified: 33..126 with no
 * compose_char held, 0 or above 255 (YAZ_ICONV_EILSEQ), no room left
 * (YAZ_ICONV_E2BIG), or written as a single byte.
 */
134 static size_t write_iso_8859_1(yaz_iconv_t cd, yaz_iconv_encoder_t e,
136 char **outbuf, size_t *outbytesleft)
138 struct encoder_data *w = e->data;
139 /* list of two char unicode sequences that, when combined, are
140 equivalent to single unicode chars that can be represented in
142 Regular iconv on Linux at least does not seem to convert these,
143 but since MARC-8 to UTF-8 generates these composed sequences
144 we get a better chance of a successful MARC-8 -> ISO-8859-1
146 unsigned char *outp = (unsigned char *) *outbuf;
151 for (i = 0; latin1_comb[i].x1; i++)
152 if (w->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
154 x = latin1_comb[i].y;
157 if (*outbytesleft < 1)
158 { /* no room. Retain compose_char and bail out */
159 yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
/* i indexes the entry with x1 == 0 only if the search ran to the end */
162 if (!latin1_comb[i].x1)
163 { /* not found. Just write compose_char */
164 *outp++ = (unsigned char) w->compose_char;
/* publish the advanced write position to the caller */
166 *outbuf = (char *) outp;
168 /* compose_char used so reset it. x now holds current char */
/* x in 33..126 and no compose_char held.
   NOTE(review): the branch body is not visible; presumably x is kept
   as the new compose_char instead of being written - confirm. */
172 if (x > 32 && x < 127 && w->compose_char == 0)
/* x is 0 or above 255: reported as an illegal sequence */
177 else if (x > 255 || x < 1)
179 yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
/* no output byte available for x */
182 else if (*outbytesleft < 1)
184 yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
/* store x as a single byte and publish the advanced write position */
187 *outp++ = (unsigned char) x;
189 *outbuf = (char *) outp;
/**
 * \brief Encoder flush handler: writes compose_char to the output buffer
 * \param cd           conversion handle; YAZ_ICONV_E2BIG is set on it when
 *                     no output byte is available
 * \param e            encoder; e->data holds the encoder_data with compose_char
 * \param outbuf       in/out output position, updated after the byte is written
 * \param outbytesleft room left in the output buffer, checked before writing
 *
 * NOTE(review): the condition guarding the write and the return statements
 * are not visible here; presumably nothing is written when compose_char
 * is 0 - confirm.
 */
193 static size_t flush_iso_8859_1(yaz_iconv_t cd, yaz_iconv_encoder_t e,
194 char **outbuf, size_t *outbytesleft)
196 struct encoder_data *w = e->data;
199 unsigned char *outp = (unsigned char *) *outbuf;
/* the byte is only written when at least one output byte is free */
200 if (*outbytesleft < 1)
202 yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
/* emit compose_char as one byte and publish the advanced position */
205 *outp++ = (unsigned char) w->compose_char;
207 *outbuf = (char *) outp;
/**
 * \brief Init handler installed as init_handle by yaz_iso_8859_1_encoder
 * \param e encoder whose data member is read as a struct encoder_data
 *
 * NOTE(review): the statement that uses w is not visible here; presumably it
 * resets compose_char - confirm.
 */
214 void init_iso_8859_1(yaz_iconv_encoder_t e)
216 struct encoder_data *w = e->data;
/**
 * \brief Destroy handler installed as destroy_handle by yaz_iso_8859_1_encoder
 * \param e encoder being destroyed
 *
 * NOTE(review): the body is not visible here; presumably it releases the
 * encoder_data allocated by yaz_iso_8859_1_encoder - confirm.
 */
220 void destroy_iso_8859_1(yaz_iconv_encoder_t e)
/**
 * \brief Sets up encoder e for ISO-8859-1 output when tocode selects it
 * \param tocode name of the target encoding, tested against "iso88591"
 * \param e      encoder that receives the write/flush/init/destroy handlers
 *
 * The setup only happens when yaz_matchstr(tocode, "iso88591") yields 0.
 * NOTE(review): the return statements and the assignment of the allocated
 * data to the encoder are not visible here - confirm against the full file.
 */
225 yaz_iconv_encoder_t yaz_iso_8859_1_encoder(const char *tocode,
226 yaz_iconv_encoder_t e)
229 if (!yaz_matchstr(tocode, "iso88591"))
/* state for the encoder (holds compose_char) */
231 struct encoder_data *data = xmalloc(sizeof(*data));
/* install the ISO-8859-1 handlers defined in this file */
233 e->write_handle = write_iso_8859_1;
234 e->flush_handle = flush_iso_8859_1;
235 e->init_handle = init_iso_8859_1;
236 e->destroy_handle = destroy_iso_8859_1;
/**
 * \brief Decoder read handler: takes the character value from inp[0]
 * \param cd          conversion handle (not used by the visible lines)
 * \param d           decoder (not used by the visible lines)
 * \param inbytesleft number of input bytes available (not used by the
 *                    visible lines)
 * \param no_read     NOTE(review): presumably receives the number of bytes
 *                    consumed; the assignment is not visible here - confirm
 */
242 static unsigned long read_ISO8859_1(yaz_iconv_t cd,
243 yaz_iconv_decoder_t d,
245 size_t inbytesleft, size_t *no_read)
/* the value of the first input byte is used directly as x */
247 unsigned long x = inp[0];
/**
 * \brief Sets up decoder d for ISO-8859-1 input when fromcode selects it
 * \param fromcode name of the source encoding, tested against "iso88591"
 * \param d        decoder that receives the read handler
 *
 * The handler is only installed when yaz_matchstr(fromcode, "iso88591")
 * yields 0.
 * NOTE(review): the return statements are not visible here - confirm against
 * the full file.
 */
252 yaz_iconv_decoder_t yaz_iso_8859_1_decoder(const char *fromcode,
253 yaz_iconv_decoder_t d)
256 if (!yaz_matchstr(fromcode, "iso88591"))
258 d->read_handle = read_ISO8859_1;
268 * indent-tabs-mode: nil
270 * vim: shiftwidth=4 tabstop=8 expandtab