http_rewrite: HTMLParser boilerplate
[metaproxy-moved-to-github.git] / src / filter_http_rewrite.cpp
1 /* This file is part of Metaproxy.
2    Copyright (C) 2005-2013 Index Data
3
4 Metaproxy is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18
19 #include "config.hpp"
20 #include <metaproxy/filter.hpp>
21 #include <metaproxy/package.hpp>
22 #include <metaproxy/util.hpp>
23 #include "filter_http_rewrite.hpp"
24 #include "html_parser.hpp"
25
26 #include <yaz/zgdu.h>
27 #include <yaz/log.h>
28
29 #include <boost/regex.hpp>
30 #include <boost/lexical_cast.hpp>
31
32 #include <map>
33
34 namespace mp = metaproxy_1;
35 namespace yf = mp::filter;
36
namespace metaproxy_1 {
    namespace filter {
        // A single search-and-replace instruction: a regular expression
        // and the replacement template that is expanded for every match.
        class HttpRewrite::Replace {
        public:
            // Pattern to search for. parse_groups() rewrites it in place,
            // stripping the names from named groups.
            std::string regex;
            // Replacement template; sub_vars() expands ${name} references
            // in it.
            std::string recipe;
            // Capture group number -> group name, filled by parse_groups().
            std::map<int, std::string> group_index;
            // Replaces every match of regex in txt with the expanded recipe.
            // Values of named groups are stored in vars. Returns the
            // rewritten text, or an empty string when nothing matched.
            const std::string search_replace(
                std::map<std::string, std::string> & vars,
                const std::string & txt) const;
            // Returns recipe with each ${name} replaced by vars[name]
            // (nothing is inserted for names missing from vars).
            std::string sub_vars (
                const std::map<std::string, std::string> & vars) const;
            // Scans regex for named groups and records them in group_index.
            void parse_groups();
        };

        // An ordered list of replacements.
        class HttpRewrite::Rule {
        public:
            std::list<Replace> replace_list;
            // Tries the replacements in order and returns the output of the
            // first one that yields a non-empty string, else "".
            const std::string test_patterns(
                std::map<std::string, std::string> & vars,
                const std::string & txt) const;
        };
        // One "within" element of the configuration: the values of its
        // header/attr/tag attributes plus the rule it refers to.
        class HttpRewrite::Within {
        public:
            std::string header;
            std::string attr;
            std::string tag;
            RulePtr rule;
        };

        // Rewrite configuration for one direction (request or response).
        // Also implements the HTMLParserEvent callbacks.
        class HttpRewrite::Phase : public HTMLParserEvent {
        public:
            std::list<Within> within_list;
            // Rewrites the request path (treated as an absolute URL).
            void rewrite_reqline(mp::odr & o, Z_HTTP_Request *hreq,
                std::map<std::string, std::string> & vars) const;
            // Rewrites each header, presented as "Name: value".
            void rewrite_headers(mp::odr & o, Z_HTTP_Header *headers,
                std::map<std::string, std::string> & vars) const;
            // Rewrites the message body; *content_buf and *content_len are
            // updated when the body changed.
            void rewrite_body(mp::odr & o,
                char **content_buf, int *content_len,
                std::map<std::string, std::string> & vars) const;
            // HTMLParserEvent callbacks.
            void openTagStart(const char *name);
            void anyTagEnd(const char *name);
            void attribute(const char *tagName,
                           const char *name,
                           const char *value,
                           int val_len);
            void closeTag(const char *name);
            void text(const char *value, int len);
        };
    }
}
88
89 yf::HttpRewrite::HttpRewrite() :
90     req_phase(new Phase), res_phase(new Phase)
91 {
92 }
93
94 yf::HttpRewrite::~HttpRewrite()
95 {
96 }
97
98 void yf::HttpRewrite::process(mp::Package & package) const
99 {
100     yaz_log(YLOG_LOG, "HttpRewrite begins....");
101     Z_GDU *gdu = package.request().get();
102     //map of request/response vars
103     std::map<std::string, std::string> vars;
104     //we have an http req
105     if (gdu && gdu->which == Z_GDU_HTTP_Request)
106     {
107         Z_HTTP_Request *hreq = gdu->u.HTTP_Request;
108         mp::odr o;
109         req_phase->rewrite_reqline(o, hreq, vars);
110         yaz_log(YLOG_LOG, ">> Request headers");
111         req_phase->rewrite_headers(o, hreq->headers, vars);
112         req_phase->rewrite_body(o,
113                 &hreq->content_buf, &hreq->content_len, vars);
114         package.request() = gdu;
115     }
116     package.move();
117     gdu = package.response().get();
118     if (gdu && gdu->which == Z_GDU_HTTP_Response)
119     {
120         Z_HTTP_Response *hres = gdu->u.HTTP_Response;
121         yaz_log(YLOG_LOG, "Response code %d", hres->code);
122         mp::odr o;
123         yaz_log(YLOG_LOG, "<< Respose headers");
124         res_phase->rewrite_headers(o, hres->headers, vars);
125         res_phase->rewrite_body(o, &hres->content_buf,
126                 &hres->content_len, vars);
127         package.response() = gdu;
128     }
129 }
130
131 void yf::HttpRewrite::Phase::rewrite_reqline (mp::odr & o,
132         Z_HTTP_Request *hreq,
133         std::map<std::string, std::string> & vars) const
134 {
135     //rewrite the request line
136     std::string path;
137     if (strstr(hreq->path, "http://") == hreq->path)
138     {
139         yaz_log(YLOG_LOG, "Path in the method line is absolute, "
140             "possibly a proxy request");
141         path += hreq->path;
142     }
143     else
144     {
145         //TODO what about proto
146         path += "http://";
147         path += z_HTTP_header_lookup(hreq->headers, "Host");
148         path += hreq->path;
149     }
150
151     std::list<Within>::const_iterator it = within_list.begin();
152     if (it != within_list.end())
153     {
154         RulePtr rule = it->rule;
155
156         yaz_log(YLOG_LOG, "Proxy request URL is %s", path.c_str());
157         std::string npath = rule->test_patterns(vars, path);
158         if (!npath.empty())
159         {
160             yaz_log(YLOG_LOG, "Rewritten request URL is %s", npath.c_str());
161             hreq->path = odr_strdup(o, npath.c_str());
162         }
163     }
164 }
165
166 void yf::HttpRewrite::Phase::rewrite_headers(mp::odr & o,
167         Z_HTTP_Header *headers,
168         std::map<std::string, std::string> & vars) const
169 {
170     for (Z_HTTP_Header *header = headers;
171             header != 0;
172             header = header->next)
173     {
174         std::string sheader(header->name);
175         sheader += ": ";
176         sheader += header->value;
177         yaz_log(YLOG_LOG, "%s: %s", header->name, header->value);
178
179         std::list<Within>::const_iterator it = within_list.begin();
180         if (it == within_list.end())
181             continue;
182         RulePtr rule = it->rule;
183
184         std::string out = rule->test_patterns(vars, sheader);
185         if (!out.empty())
186         {
187             size_t pos = out.find(": ");
188             if (pos == std::string::npos)
189             {
190                 yaz_log(YLOG_LOG, "Header malformed during rewrite, ignoring");
191                 continue;
192             }
193             header->name = odr_strdup(o, out.substr(0, pos).c_str());
194             header->value = odr_strdup(o, out.substr(pos+2,
195                                                      std::string::npos).c_str());
196         }
197     }
198 }
199
200 void yf::HttpRewrite::Phase::rewrite_body(mp::odr & o,
201         char **content_buf,
202         int *content_len,
203         std::map<std::string, std::string> & vars) const
204 {
205     if (*content_buf)
206     {
207
208         std::list<Within>::const_iterator it = within_list.begin();
209         if (it != within_list.end())
210         {
211             RulePtr rule = it->rule;
212
213             std::string body(*content_buf);
214             std::string nbody = rule->test_patterns(vars, body);
215             if (!nbody.empty())
216             {
217                 *content_buf = odr_strdup(o, nbody.c_str());
218                 *content_len = nbody.size();
219             }
220         }
221     }
222 }
223
224
225 void yf::HttpRewrite::Phase::openTagStart(const char *name)
226 {
227 }
228
229 void yf::HttpRewrite::Phase::anyTagEnd(const char *name)
230 {
231 }
232
233 void yf::HttpRewrite::Phase::attribute(const char *tagName,
234                                        const char *name,
235                                        const char *value,
236                                        int val_len)
237 {
238 }
239
240
241 void yf::HttpRewrite::Phase::closeTag(const char *name)
242 {
243 }
244
245 void yf::HttpRewrite::Phase::text(const char *value, int len)
246 {
247 }
248
249
250 /**
251  * Tests pattern from the vector in order and executes recipe on
252  the first match.
253  */
254 const std::string yf::HttpRewrite::Rule::test_patterns(
255         std::map<std::string, std::string> & vars,
256         const std::string & txt) const
257 {
258     std::list<Replace>::const_iterator it = replace_list.begin();
259
260     for (; it != replace_list.end(); it++)
261     {
262         std::string out = it->search_replace(vars, txt);
263         if (!out.empty()) return out;
264     }
265     return "";
266 }
267
268 const std::string yf::HttpRewrite::Replace::search_replace(
269         std::map<std::string, std::string> & vars,
270         const std::string & txt) const
271 {
272     //exec regex against value
273     boost::regex re(regex);
274     boost::smatch what;
275     std::string::const_iterator start, end;
276     start = txt.begin();
277     end = txt.end();
278     std::string out;
279     while (regex_search(start, end, what, re)) //find next full match
280     {
281         size_t i;
282         for (i = 1; i < what.size(); ++i)
283         {
284             //check if the group is named
285             std::map<int, std::string>::const_iterator it
286                 = group_index.find(i);
287             if (it != group_index.end())
288             {   //it is
289                 if (!what[i].str().empty())
290                     vars[it->second] = what[i];
291             }
292
293         }
294         //prepare replacement string
295         std::string rvalue = sub_vars(vars);
296         yaz_log(YLOG_LOG, "! Rewritten '%s' to '%s'",
297                 what.str(0).c_str(), rvalue.c_str());
298         out.append(start, what[0].first);
299         out.append(rvalue);
300         start = what[0].second; //move search forward
301     }
302     //if we had a match cat the last part
303     if (start != txt.begin())
304         out.append(start, end);
305     return out;
306 }
307
308 void yf::HttpRewrite::Replace::parse_groups()
309 {
310     int gnum = 0;
311     bool esc = false;
312     const std::string & str = regex;
313     std::string res;
314     yaz_log(YLOG_LOG, "Parsing groups from '%s'", str.c_str());
315     for (size_t i = 0; i < str.size(); ++i)
316     {
317         res += str[i];
318         if (!esc && str[i] == '\\')
319         {
320             esc = true;
321             continue;
322         }
323         if (!esc && str[i] == '(') //group starts
324         {
325             gnum++;
326             if (i+1 < str.size() && str[i+1] == '?') //group with attrs
327             {
328                 i++;
329                 if (i+1 < str.size() && str[i+1] == ':') //non-capturing
330                 {
331                     if (gnum > 0) gnum--;
332                     res += str[i];
333                     i++;
334                     res += str[i];
335                     continue;
336                 }
337                 if (i+1 < str.size() && str[i+1] == 'P') //optional, python
338                     i++;
339                 if (i+1 < str.size() && str[i+1] == '<') //named
340                 {
341                     i++;
342                     std::string gname;
343                     bool term = false;
344                     while (++i < str.size())
345                     {
346                         if (str[i] == '>') { term = true; break; }
347                         if (!isalnum(str[i]))
348                             throw mp::filter::FilterException
349                                 ("Only alphanumeric chars allowed, found "
350                                  " in '"
351                                  + str
352                                  + "' at "
353                                  + boost::lexical_cast<std::string>(i));
354                         gname += str[i];
355                     }
356                     if (!term)
357                         throw mp::filter::FilterException
358                             ("Unterminated group name '" + gname
359                              + " in '" + str +"'");
360                     group_index[gnum] = gname;
361                     yaz_log(YLOG_LOG, "Found named group '%s' at $%d",
362                             gname.c_str(), gnum);
363                 }
364             }
365         }
366         esc = false;
367     }
368     regex = res;
369 }
370
371 std::string yf::HttpRewrite::Replace::sub_vars (
372         const std::map<std::string, std::string> & vars) const
373 {
374     std::string out;
375     bool esc = false;
376     const std::string & in = recipe;
377     for (size_t i = 0; i < in.size(); ++i)
378     {
379         if (!esc && in[i] == '\\')
380         {
381             esc = true;
382             continue;
383         }
384         if (!esc && in[i] == '$') //var
385         {
386             if (i+1 < in.size() && in[i+1] == '{') //ref prefix
387             {
388                 ++i;
389                 std::string name;
390                 bool term = false;
391                 while (++i < in.size())
392                 {
393                     if (in[i] == '}') { term = true; break; }
394                     name += in[i];
395                 }
396                 if (!term) throw mp::filter::FilterException
397                     ("Unterminated var ref in '"+in+"' at "
398                      + boost::lexical_cast<std::string>(i));
399                 std::map<std::string, std::string>::const_iterator it
400                     = vars.find(name);
401                 if (it != vars.end())
402                 {
403                     out += it->second;
404                 }
405             }
406             else
407             {
408                 throw mp::filter::FilterException
409                     ("Malformed or trimmed var ref in '"
410                      +in+"' at "+boost::lexical_cast<std::string>(i));
411             }
412             continue;
413         }
414         //passthru
415         out += in[i];
416         esc = false;
417     }
418     return out;
419 }
420
421
422 void yf::HttpRewrite::configure_phase(const xmlNode *ptr, Phase &phase)
423 {
424     std::map<std::string, RulePtr > rules;
425     for (ptr = ptr->children; ptr; ptr = ptr->next)
426     {
427         if (ptr->type != XML_ELEMENT_NODE)
428             continue;
429         else if (!strcmp((const char *) ptr->name, "rule"))
430         {
431             static const char *names[2] = { "name", 0 };
432             std::string values[1];
433             values[0] = "default";
434             mp::xml::parse_attr(ptr, names, values);
435
436             RulePtr rule(new Rule);
437             for (xmlNode *p = ptr->children; p; p = p->next)
438             {
439                 if (p->type != XML_ELEMENT_NODE)
440                     continue;
441                 if (!strcmp((const char *) p->name, "rewrite"))
442                 {
443                     Replace replace;
444                     const struct _xmlAttr *attr;
445                     for (attr = p->properties; attr; attr = attr->next)
446                     {
447                         if (!strcmp((const char *) attr->name,  "from"))
448                             replace.regex = mp::xml::get_text(attr->children);
449                         else if (!strcmp((const char *) attr->name,  "to"))
450                             replace.recipe = mp::xml::get_text(attr->children);
451                         else
452                             throw mp::filter::FilterException
453                                 ("Bad attribute "
454                                  + std::string((const char *) attr->name)
455                                  + " in rewrite section of http_rewrite");
456                     }
457                     yaz_log(YLOG_LOG, "Found rewrite rule from '%s' to '%s'",
458                             replace.regex.c_str(), replace.recipe.c_str());
459                     replace.parse_groups();
460                     if (!replace.regex.empty())
461                         rule->replace_list.push_back(replace);
462                 }
463                 else
464                     throw mp::filter::FilterException
465                         ("Bad element "
466                          + std::string((const char *) p->name)
467                          + " in http_rewrite filter");
468             }
469             if (!rule->replace_list.empty())
470                 rules[values[0]] = rule;
471         }
472         else if (!strcmp((const char *) ptr->name, "within"))
473         {
474             static const char *names[5] =
475                 { "header", "attr", "tag", "rule", 0 };
476             std::string values[4];
477             mp::xml::parse_attr(ptr, names, values);
478             Within w;
479             w.header = values[0];
480             w.attr = values[1];
481             w.tag = values[2];
482             std::map<std::string,RulePtr>::const_iterator it =
483                 rules.find(values[3]);
484             if (it == rules.end())
485                 throw mp::filter::FilterException
486                     ("Reference to non-existing rule '" + values[3] +
487                      "' in http_rewrite filter");
488             w.rule = it->second;
489             phase.within_list.push_back(w);
490         }
491         else
492         {
493             throw mp::filter::FilterException
494                 ("Bad element "
495                  + std::string((const char *) ptr->name)
496                  + " in http_rewrite filter");
497         }
498     }
499 }
500
501 void yf::HttpRewrite::configure(const xmlNode * ptr, bool test_only,
502         const char *path)
503 {
504     for (ptr = ptr->children; ptr; ptr = ptr->next)
505     {
506         if (ptr->type != XML_ELEMENT_NODE)
507             continue;
508         else if (!strcmp((const char *) ptr->name, "request"))
509         {
510             configure_phase(ptr, *req_phase);
511         }
512         else if (!strcmp((const char *) ptr->name, "response"))
513         {
514             configure_phase(ptr, *res_phase);
515         }
516         else
517         {
518             throw mp::filter::FilterException
519                 ("Bad element "
520                  + std::string((const char *) ptr->name)
521                  + " in http_rewrite1 filter");
522         }
523     }
524 }
525
526 static mp::filter::Base* filter_creator()
527 {
528     return new mp::filter::HttpRewrite;
529 }
530
// Registration record for this filter, exported with C linkage so the
// symbol name is not mangled. It carries the filter type name
// "http_rewrite" and the factory that creates instances.
extern "C" {
    struct metaproxy_1_filter_struct metaproxy_1_filter_http_rewrite = {
        0, // NOTE(review): meaning of this first field is not visible here
        "http_rewrite",
        filter_creator
    };
}
538
539
540 /*
541  * Local variables:
542  * c-basic-offset: 4
543  * c-file-style: "Stroustrup"
544  * indent-tabs-mode: nil
545  * End:
546  * vim: shiftwidth=4 tabstop=8 expandtab
547  */
548