1 /* This file is part of the YAZ toolkit.
2 * Copyright (C) Index Data
3 * See the file LICENSE for details.
7 * \brief ISO-8859-1 encoding / decoding */
19 #include <yaz/xmalloc.h>
24 unsigned long compose_char;
/* Composition table entries: { base letter, combining mark, Latin-1 code }.
 * The lookup code in this file reads the three values as x1, x2 and y and
 * stops scanning at the first entry whose x1 is zero.
 * Combining marks used: 0x0300 grave, 0x0301 acute, 0x0302 circumflex,
 * 0x0303 tilde, 0x0308 diaeresis, 0x030a ring above, 0x0327 cedilla.
 * Codes carrying an "omitted" remark have no entry in the table.
 * NOTE(review): the array declaration and its terminating entry are not
 * part of this excerpt. */
33 { 'A', 0x0300, 0xc0}, /* LATIN CAPITAL LETTER A WITH GRAVE */
34 { 'A', 0x0301, 0xc1}, /* LATIN CAPITAL LETTER A WITH ACUTE */
35 { 'A', 0x0302, 0xc2}, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
36 { 'A', 0x0303, 0xc3}, /* LATIN CAPITAL LETTER A WITH TILDE */
37 { 'A', 0x0308, 0xc4}, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
38 { 'A', 0x030a, 0xc5}, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
39 /* omitted: 0xc6 LATIN CAPITAL LETTER AE */
40 { 'C', 0x0327, 0xc7}, /* LATIN CAPITAL LETTER C WITH CEDILLA */
41 { 'E', 0x0300, 0xc8}, /* LATIN CAPITAL LETTER E WITH GRAVE */
42 { 'E', 0x0301, 0xc9}, /* LATIN CAPITAL LETTER E WITH ACUTE */
43 { 'E', 0x0302, 0xca}, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
44 { 'E', 0x0308, 0xcb}, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
45 { 'I', 0x0300, 0xcc}, /* LATIN CAPITAL LETTER I WITH GRAVE */
46 { 'I', 0x0301, 0xcd}, /* LATIN CAPITAL LETTER I WITH ACUTE */
47 { 'I', 0x0302, 0xce}, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
48 { 'I', 0x0308, 0xcf}, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
49 { 'N', 0x0303, 0xd1}, /* LATIN CAPITAL LETTER N WITH TILDE */
50 { 'O', 0x0300, 0xd2}, /* LATIN CAPITAL LETTER O WITH GRAVE */
51 { 'O', 0x0301, 0xd3}, /* LATIN CAPITAL LETTER O WITH ACUTE */
52 { 'O', 0x0302, 0xd4}, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
53 { 'O', 0x0303, 0xd5}, /* LATIN CAPITAL LETTER O WITH TILDE */
54 { 'O', 0x0308, 0xd6}, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
55 /* omitted: 0xd7 MULTIPLICATION SIGN */
56 /* omitted: 0xd8 LATIN CAPITAL LETTER O WITH STROKE */
57 { 'U', 0x0300, 0xd9}, /* LATIN CAPITAL LETTER U WITH GRAVE */
58 { 'U', 0x0301, 0xda}, /* LATIN CAPITAL LETTER U WITH ACUTE */
59 { 'U', 0x0302, 0xdb}, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
60 { 'U', 0x0308, 0xdc}, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
61 { 'Y', 0x0301, 0xdd}, /* LATIN CAPITAL LETTER Y WITH ACUTE */
62 /* omitted: 0xde LATIN CAPITAL LETTER THORN */
63 /* omitted: 0xdf LATIN SMALL LETTER SHARP S */
64 { 'a', 0x0300, 0xe0}, /* LATIN SMALL LETTER A WITH GRAVE */
65 { 'a', 0x0301, 0xe1}, /* LATIN SMALL LETTER A WITH ACUTE */
66 { 'a', 0x0302, 0xe2}, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
67 { 'a', 0x0303, 0xe3}, /* LATIN SMALL LETTER A WITH TILDE */
68 { 'a', 0x0308, 0xe4}, /* LATIN SMALL LETTER A WITH DIAERESIS */
69 { 'a', 0x030a, 0xe5}, /* LATIN SMALL LETTER A WITH RING ABOVE */
70 /* omitted: 0xe6 LATIN SMALL LETTER AE */
71 { 'c', 0x0327, 0xe7}, /* LATIN SMALL LETTER C WITH CEDILLA */
72 { 'e', 0x0300, 0xe8}, /* LATIN SMALL LETTER E WITH GRAVE */
73 { 'e', 0x0301, 0xe9}, /* LATIN SMALL LETTER E WITH ACUTE */
74 { 'e', 0x0302, 0xea}, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
75 { 'e', 0x0308, 0xeb}, /* LATIN SMALL LETTER E WITH DIAERESIS */
76 { 'i', 0x0300, 0xec}, /* LATIN SMALL LETTER I WITH GRAVE */
77 { 'i', 0x0301, 0xed}, /* LATIN SMALL LETTER I WITH ACUTE */
78 { 'i', 0x0302, 0xee}, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
79 { 'i', 0x0308, 0xef}, /* LATIN SMALL LETTER I WITH DIAERESIS */
80 /* omitted: 0xf0 LATIN SMALL LETTER ETH */
81 { 'n', 0x0303, 0xf1}, /* LATIN SMALL LETTER N WITH TILDE */
82 { 'o', 0x0300, 0xf2}, /* LATIN SMALL LETTER O WITH GRAVE */
83 { 'o', 0x0301, 0xf3}, /* LATIN SMALL LETTER O WITH ACUTE */
84 { 'o', 0x0302, 0xf4}, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
85 { 'o', 0x0303, 0xf5}, /* LATIN SMALL LETTER O WITH TILDE */
86 { 'o', 0x0308, 0xf6}, /* LATIN SMALL LETTER O WITH DIAERESIS */
87 /* omitted: 0xf7 DIVISION SIGN */
88 /* omitted: 0xf8 LATIN SMALL LETTER O WITH STROKE */
89 { 'u', 0x0300, 0xf9}, /* LATIN SMALL LETTER U WITH GRAVE */
90 { 'u', 0x0301, 0xfa}, /* LATIN SMALL LETTER U WITH ACUTE */
91 { 'u', 0x0302, 0xfb}, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
92 { 'u', 0x0308, 0xfc}, /* LATIN SMALL LETTER U WITH DIAERESIS */
93 { 'y', 0x0301, 0xfd}, /* LATIN SMALL LETTER Y WITH ACUTE */
94 /* omitted: 0xfe LATIN SMALL LETTER THORN */
95 { 'y', 0x0308, 0xff}, /* LATIN SMALL LETTER Y WITH DIAERESIS */
/* Reverse lookup: find the latin1_comb entry whose precomposed code (.y)
 * equals v and store its two components through x1 and x2.
 * NOTE(review): the return statements are missing from this excerpt, so the
 * found / not-found return values are not documented here. */
100 int yaz_iso_8859_1_lookup_y(unsigned long v,
101 unsigned long *x1, unsigned long *x2)
103 if (v >= 0xc0 && v <= 0xff) /* optimization. min and max .y values */
/* scan until the entry whose x1 is zero */
106 for (i = 0; latin1_comb[i].x1; i++)
108 if (v == latin1_comb[i].y)
/* match: hand back the entry's x1 and x2 values */
110 *x1 = latin1_comb[i].x1;
111 *x2 = latin1_comb[i].x2;
/* Forward lookup: find the latin1_comb entry matching both x1 and x2 and
 * store its precomposed code (.y) through y.
 * NOTE(review): the rest of the parameter list and the return statements
 * are missing from this excerpt. */
119 int yaz_iso_8859_1_lookup_x12(unsigned long x1, unsigned long x2,
122 /* For MARC8s we try to get a Latin-1 page code out of it */
/* scan until the entry whose x1 is zero */
124 for (i = 0; latin1_comb[i].x1; i++)
125 if (x2 == latin1_comb[i].x2 && x1 == latin1_comb[i].x1)
127 *y = latin1_comb[i].y;
/* Write handler for the ISO-8859-1 encoder (installed as e->write_handle by
 * yaz_iso_8859_1_encoder). Emits single bytes through *outbuf and keeps one
 * pending character in the encoder state (compose_char) so that it can be
 * merged with a following combining mark via latin1_comb.
 * Errors visible here: YAZ_ICONV_E2BIG when *outbytesleft < 1, and
 * YAZ_ICONV_EILSEQ when x is 0 or above 255.
 * NOTE(review): several lines of this function are missing from this
 * excerpt. In particular the block comment starting "list of two char" has
 * lost its terminator, so as shown the statements after it, up to the next
 * comment terminator, are swallowed by it. Restore the missing lines before
 * building. */
133 static size_t write_iso_8859_1(yaz_iconv_t cd, yaz_iconv_encoder_t e,
135 char **outbuf, size_t *outbytesleft)
137 struct encoder_data *w = (struct encoder_data *) e->data;
138 /* list of two char unicode sequence that, when combined, are
139 equivalent to single unicode chars that can be represented in
141 Regular iconv on Linux at least does not seem to convert these,
142 but since MARC-8 to UTF-8 generates these composed sequence
143 we get a better chance of a successful MARC-8 -> ISO-8859-1
145 unsigned char *outp = (unsigned char *) *outbuf;
150 for (i = 0; latin1_comb[i].x1; i++)
151 if (w->compose_char == latin1_comb[i].x1 && x == latin1_comb[i].x2)
153 x = latin1_comb[i].y;
156 if (*outbytesleft < 1)
157 { /* no room. Retain compose_char and bail out */
158 yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
/* x1 of the current entry is zero: this is the test for "no pair matched" */
161 if (!latin1_comb[i].x1)
162 { /* not found. Just write compose_char */
163 *outp++ = (unsigned char) w->compose_char;
165 *outbuf = (char *) outp;
167 /* compose_char used so reset it. x now holds current char */
/* printable ASCII above space with nothing pending.
 * NOTE(review): the branch body is not in this excerpt; presumably x is
 * held back as the new compose_char. Confirm against the full file. */
171 if (x > 32 && x < 127 && w->compose_char == 0)
/* only code points 1..255 fit in a single output byte */
176 else if (x > 255 || x < 1)
178 yaz_iconv_set_errno(cd, YAZ_ICONV_EILSEQ);
181 else if (*outbytesleft < 1)
183 yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
/* store the byte and publish the advanced output position */
186 *outp++ = (unsigned char) x;
188 *outbuf = (char *) outp;
/* Flush handler for the ISO-8859-1 encoder (installed as e->flush_handle by
 * yaz_iso_8859_1_encoder): writes the character held in compose_char as one
 * byte through *outbuf, or reports YAZ_ICONV_E2BIG when *outbytesleft < 1.
 * NOTE(review): several lines of this function, including the return
 * statements, are missing from this excerpt. */
192 static size_t flush_iso_8859_1(yaz_iconv_t cd, yaz_iconv_encoder_t e,
193 char **outbuf, size_t *outbytesleft)
195 struct encoder_data *w = (struct encoder_data *) e->data;
198 unsigned char *outp = (unsigned char *) *outbuf;
199 if (*outbytesleft < 1)
201 yaz_iconv_set_errno(cd, YAZ_ICONV_E2BIG);
/* emit the pending character and publish the advanced output position */
204 *outp++ = (unsigned char) w->compose_char;
206 *outbuf = (char *) outp;
/* Init handler for the ISO-8859-1 encoder (installed as e->init_handle by
 * yaz_iso_8859_1_encoder). Fetches the encoder state from e->data.
 * NOTE(review): the statement that uses w is missing from this excerpt;
 * presumably it clears compose_char. Confirm against the full file. */
213 void init_iso_8859_1(yaz_iconv_encoder_t e)
215 struct encoder_data *w = (struct encoder_data *) e->data;
/* Destroy handler for the ISO-8859-1 encoder (installed as e->destroy_handle
 * by yaz_iso_8859_1_encoder).
 * NOTE(review): the body is missing from this excerpt; presumably it releases
 * the encoder_data allocated in yaz_iso_8859_1_encoder. Confirm. */
219 void destroy_iso_8859_1(yaz_iconv_encoder_t e)
/* Configure e as an ISO-8859-1 encoder when tocode names that charset.
 * A zero result from yaz_matchstr(tocode, "iso88591") selects this encoder;
 * in that case encoder state is allocated with xmalloc() and the write,
 * flush, init and destroy handlers are installed on e.
 * NOTE(review): the statement attaching the state to e (the handlers read it
 * from e->data) and the return statements are missing from this excerpt. */
224 yaz_iconv_encoder_t yaz_iso_8859_1_encoder(const char *tocode,
225 yaz_iconv_encoder_t e)
228 if (!yaz_matchstr(tocode, "iso88591"))
/* per-encoder state: holds the pending compose_char */
230 struct encoder_data *data = (struct encoder_data *)
231 xmalloc(sizeof(*data));
233 e->write_handle = write_iso_8859_1;
234 e->flush_handle = flush_iso_8859_1;
235 e->init_handle = init_iso_8859_1;
236 e->destroy_handle = destroy_iso_8859_1;
/* Read handler for the ISO-8859-1 decoder (installed as d->read_handle by
 * yaz_iso_8859_1_decoder): takes the first input element inp[0] as the
 * character value.
 * NOTE(review): the declaration of inp, any update of *no_read and the
 * return statement are missing from this excerpt. */
242 static unsigned long read_ISO8859_1(yaz_iconv_t cd,
243 yaz_iconv_decoder_t d,
245 size_t inbytesleft, size_t *no_read)
247 unsigned long x = inp[0];
/* Configure d as an ISO-8859-1 decoder when fromcode names that charset.
 * A zero result from yaz_matchstr(fromcode, "iso88591") selects this decoder
 * and installs read_ISO8859_1 as its read handler.
 * NOTE(review): the return statements are missing from this excerpt. */
252 yaz_iconv_decoder_t yaz_iso_8859_1_decoder(const char *fromcode,
253 yaz_iconv_decoder_t d)
256 if (!yaz_matchstr(fromcode, "iso88591"))
258 d->read_handle = read_ISO8859_1;
268 * c-file-style: "Stroustrup"
269 * indent-tabs-mode: nil
271 * vim: shiftwidth=4 tabstop=8 expandtab