2 # ============================================================================
3 # Zebra perl API header
4 # =============================================================================
7 # ============================================================================
8 package IDZebra::Filter;
11 use IDZebra::Logger qw(:flags :calls);
12 use Symbol qw(gensym);
18 IDZebra::init(); # ??? Do we need that at all (this is jus nmem init...)
22 # -----------------------------------------------------------------------------
24 # -----------------------------------------------------------------------------
26 my ($proto,$context) = @_;
27 my $class = ref($proto) || $proto;
29 $self->{context} = $context;
30 bless ($self, $class);
34 # -----------------------------------------------------------------------------
36 # -----------------------------------------------------------------------------
41 # print STDERR "LEAK",Devel::Leak::CheckSV($self->{dl}),"\n";
44 # print STDERR "LEAK",Devel::Leak::NoteSV($self->{dl}),"\n";
46 # This is ugly... could be passed as parameters... but didn't work.
48 my $dh = IDZebra::grs_perl_get_dh($self->{context});
49 my $mem = IDZebra::grs_perl_get_mem($self->{context});
50 my $d1 = IDZebra::Data1->get($dh,$mem);
54 eval {$rootnode = $self->process($d1)};
56 logf(LOG_WARN,"Error processing perl filter:%s\n",$@);
59 $rootnode = $self->process($d1);
61 IDZebra::grs_perl_set_res($self->{context},$rootnode);
66 my ($self, $buff) = @_;
67 $self->{_buff} = $buff;
70 # -----------------------------------------------------------------------------
71 # API Template - These methods should be overriden by the implementing class.
72 # -----------------------------------------------------------------------------
74 # This one is called once, when the module is loaded. Not in
75 # object context yet!!!
80 # Just going to return a root node.
81 return ($d1->mk_root('empty'));
84 # -----------------------------------------------------------------------------
86 # -----------------------------------------------------------------------------
88 my ($proto, $file, %args) = @_;
90 my $class = ref($proto) || $proto;
92 bless ($self, $class);
95 open ($th, $file) || croak ("Cannot open $file");
99 my $m = IDZebra::nmem_create();
100 my $d1=IDZebra::Data1->new($m,$IDZebra::DATA1_FLAG_XML);
101 if ($args{tabPath}) { $d1->tabpath($args{tabPath}); }
102 if ($args{tabRoot}) { $d1->tabroot($args{tabRoot}); }
104 my $rootnode = $self->process($d1);
105 $d1->pr_tree($rootnode);
106 $d1->free_tree($rootnode);
110 $self->{testh} = undef;
114 # -----------------------------------------------------------------------------
116 # -----------------------------------------------------------------------------
118 my ($self, $buff, $len, $offset) = @_;
120 if ($self->{testh}) {
121 return (read($self->{testh},$_[1],$len,$offset));
123 my $r = IDZebra::grs_perl_readf($self->{context},$len);
125 $buff = $self->{_buff};
126 $self->{_buff} = undef;
135 my $r = IDZebra::grs_perl_readline($self->{context});
137 my $buff = $self->{_buff};
138 $self->{_buff} = undef;
146 return(IDZebra::grs_perl_getc($self->{context}));
151 if ($self->{testh}) {
152 return ($self->{testh});
155 tie (*$fh,'IDZebra::FilterFile', $self);
161 my ($self, $buffsize) = @_;
166 if ($self->{testh}) {
167 $r = read($self->{testh}, $self->{_buff}, $buffsize);
169 $r = IDZebra::grs_perl_readf($self->{context},$buffsize);
172 $res .= $self->{_buff};
173 $self->{_buff} = undef;
181 my ($self, $offset) = @_;
182 if ($self->{testh}) {
183 # I'm not sure if offset is absolute or relative here...
184 return (seek ($self->{testh}, $offset, $0));
186 return (IDZebra::grs_perl_seekf($self->{context},$offset)) ;
192 if ($self->{testh}) {
195 return (IDZebra::grs_perl_seekf($self->{context}));
200 my ($self, $offset) = @_;
201 if ($self->{testh}) {
204 IDZebra::grs_perl_endf($self->{context},$offset);
207 # ----------------------------------------------------------------------------
208 # The 'virtual' filehandle for zebra extract calls
209 # ----------------------------------------------------------------------------
210 package IDZebra::FilterFile;
213 our @ISA = qw(Tie::Handle);
218 bless ($self, $class);
219 $self->{filter} = shift;
225 return ($self->{filter}->readf(@_));
230 return ($self->{filter}->readline());
235 return ($self->{filter}->getc());
239 croak ("EOF not implemented");
243 croak ("TELL not implemented");
247 croak ("SEEK not implemented");
259 IDZebra::Filter - A superclass of perl filters for Zebra
266 our @ISA=qw(IDZebra::Filter);
276 my $rootnode=$d1->mk_root('meta');
283 When Zebra is trying to index/present a record, needs to extract information from it's source. For some types of input, "hardcoded" procedures are defined, but you have the opportunity to write your own filter code in Tcl or in perl.
285 The perl implementation is nothing but a package, deployed in some available location for Zebra (in I<profilePath>, or in PERL_INCLUDE (@INC)). This package is interpreted once needed, a filter object is created, armored with knowledge enough, to read data from the source, and to generate data1 structures as result. For each individual source "files" the process method is called.
287 This class is supposed to be inherited in all perl filter classes, as it is providing a way of communication between the filter code and the indexing/retrieval process.
289 =head1 IMPLEMENTING FILTERS IN PERL
291 All you have to do is to create a perl package, using and inheriting this one (IDZebra::Filter), and implement the "process" method. The parameter of this call is an IDZebra::Data1 object, representing a data1 handle. You can use this, to reach the configuration information and to build your data structure. What you have to return is a data1 root node. To create it:
293 my $rootnode=$d1->mk_root('meta');
295 where 'meta' is the abstract syntax identifier (in this case Zebra will try to locate meta.abs, and apply it). Then just continue to build the structure. See i<IDZebra::Data1> for details.
297 In order to get the input stream, you can use "virtual" file operators (as the source is not necessairly a file):
301 =item B<readf($buf,$len,$offset)>
303 Going to read $len bytes of data from offset $offset into $buff
311 Get one character (byte)
313 =item B<readall($bufflen)>
315 Read the entire stream, by reading $bufflen bytes at once
317 =item B<seekf($offset)>
323 Tells the current offset (?)
325 =item B<endf($offset)>
331 You can optionally get a virtual perl filehandle as well:
333 my $fh = $self->get_fh();
338 Note, that the virtual filehandle implementation is not finished yet, so some applications may have problems using that. See TODO.
340 You can implement an init call for your class. This call is not going to be called in object, but in class context. Stupid, eh?
342 =head1 TEST YOUR PERL FILTER
344 You can check the functionality of your filter code, by writing a small test program like
348 $res =pod->test($ARGV[0],
349 (tabPath=>'.:../../tab:../../../yaz/tab'));
351 This will try to apply the filter on the file provided as argument, and display the generated data1 structure. However, there are some differences, when running a filter in test mode:
352 - The include path is not applied from tabPath
353 - the tellf, and endf functions are not implemented (just ignored)
355 =head1 CONFIGURE ZEBRA TO USE A PERL FILTER
357 This is quite simple. Read the Zebra manual, and follow the instructions to create your zebra.cfg. For your I<recordType> choose 'grs.perl.<YourFilterClass>'.
358 Copy your filter module (YourFilterClass.pm) to a directory listed in I<profilePath>. i<profilePath> is added to @INC, when interpreting your package: so if you need to load modules from different locations than the default perl include path, just add these directories.
362 By default, filter code (process method) is executed within an eval {} block, and only a warning is sent to the log, if there is an error. To turn this option off, set B<$IDZebra::Filter::SAFE_MODE> to B<0>;
366 Finish virtual (tied) filehandle methods (SEEK, EOF, TELL);
374 Peter Popovics, pop@technomat.hu
378 IDZebra, IDZebra::Data1, Zebra documentation