Continuation lines
[idzebra-moved-to-github.git] / index / invstat.c
1 /* $Id: invstat.c,v 1.35 2003-06-20 14:21:23 heikki Exp $
2    Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002
3    Index Data Aps
4
5 This file is part of the Zebra server.
6
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
11
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15 for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra.  If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
20 02111-1307, USA.
21 */
22
23
24 #include <stdio.h>
25 #include <assert.h>
26 #include <string.h>
27
28 #include "index.h"
29 #include "../isamc/isamd-p.h"
30
31 struct inv_stat_info {
32     ZebraHandle zh;
33     int no_isam_entries[9];
34     int no_dict_entries;
35     int no_dict_bytes;
36     int isam_bounds[20];
37     int isam_occurrences[20];
38     char tmp[128];
39     int isamb_levels[10][5];
40     int isamb_sizes[10];
41     int isamb_blocks[10];
42     unsigned long cksum;
43     int dumpwords;
44 };
45
46 #define SINGLETON_TYPE 8 /* the type to use for singletons that */ 
47                          /* have no block and no block type */
48
49 static void print_dict_item (ZebraMaps zm, const char *s, int count,
50             int firstsys, int firstseq, int lastsys, int lastseq )
51 {
52     int reg_type = s[1];
53     char keybuf[IT_MAX_WORD+1];
54     char *to = keybuf;
55     const char *from = s + 2;
56
57     while (*from)
58     {
59         const char *res = zebra_maps_output (zm, reg_type, &from);
60         if (!res)
61             *to++ = *from++;
62         else
63             while (*res)
64                 *to++ = *res++;
65     }
66     *to = '\0';
67     /* yaz_log (LOG_LOG, "%s", keybuf); */
68     printf("%10d %s %d.%d - %d.%d\n",count, keybuf,
69               firstsys,firstseq, lastsys,lastseq);
70 }
71
72 static int inv_stat_handle (char *name, const char *info, int pos,
73                             void *client)
74 {
75     int occur = 0;
76     int i = 0;
77     struct inv_stat_info *stat_info = (struct inv_stat_info*) client;
78     ISAMS_P isam_p;
79     int firstsys=-1;
80     int firstseq=-1;
81     int lastsys=-1;
82     int lastseq=-1;
83
84     stat_info->no_dict_entries++;
85     stat_info->no_dict_bytes += strlen(name);
86
87     if (!stat_info->zh->reg->isamd)
88     {
89         assert (*info == sizeof(ISAMS_P));
90         memcpy (&isam_p, info+1, sizeof(ISAMS_P));
91     }
92
93     if (stat_info->zh->reg->isams)
94     {
95         ISAMS_PP pp;
96         int occurx = 0;
97         struct it_key key;
98
99         pp = isams_pp_open (stat_info->zh->reg->isams, isam_p);
100         occur = isams_pp_num (pp);
101         while (isams_pp_read(pp, &key))
102         {
103             stat_info->cksum = stat_info->cksum * 65509 + 
104                 key.sysno + 11 * key.seqno;
105             occurx++;
106             if (-1==firstsys)
107             {
108                 firstseq=key.seqno;
109                 firstsys=key.sysno;
110             }
111             lastsys=key.sysno;
112             lastseq=key.seqno;
113         }
114         assert (occurx == occur);
115         stat_info->no_isam_entries[0] += occur;
116         isams_pp_close (pp);
117     }
118     if (stat_info->zh->reg->isam)
119     {
120         ISPT ispt;
121
122         ispt = is_position (stat_info->zh->reg->isam, isam_p);
123         occur = is_numkeys (ispt);
124         stat_info->no_isam_entries[is_type(isam_p)] += occur;
125         is_pt_free (ispt);
126     }
127     if (stat_info->zh->reg->isamc)
128     {
129         ISAMC_PP pp;
130         int occurx = 0;
131         struct it_key key;
132
133         pp = isc_pp_open (stat_info->zh->reg->isamc, isam_p);
134         occur = isc_pp_num (pp);
135         while (isc_pp_read(pp, &key))
136         {
137             stat_info->cksum = stat_info->cksum * 65509 + 
138                 key.sysno + 11 * key.seqno;
139             occurx++;
140             if (-1==firstsys)
141             {
142                 firstseq=key.seqno;
143                 firstsys=key.sysno;
144             }
145             lastsys=key.sysno;
146             lastseq=key.seqno;
147         }
148         assert (occurx == occur);
149         stat_info->no_isam_entries[isc_type(isam_p)] += occur;
150         isc_pp_close (pp);
151     }
152     if (stat_info->zh->reg->isamd)
153     {
154         ISAMD_PP pp;
155         int occurx = 0;
156         struct it_key key;
157         /* printf("[%d: %d %d %d %d %d %d] ", */
158         /*    info[0], info[1], info[2], info[3], info[4], info[5], info[7]);*/
159         pp = isamd_pp_open (stat_info->zh->reg->isamd, info+1, info[0]);
160         
161         occur = isamd_pp_num (pp);
162         while (isamd_pp_read(pp, &key))
163         {
164             stat_info->cksum = stat_info->cksum * 65509 + 
165                 key.sysno + 11 * key.seqno;
166             occurx++;
167             /* printf("%d.%d ", key.sysno, key.seqno); */ /*!*/
168             if (-1==firstsys)
169             {
170                 firstseq=key.seqno;
171                 firstsys=key.sysno;
172             }
173             lastsys=key.sysno;
174             lastseq=key.seqno;
175             if ( pp->is->method->debug >8 )
176                logf (LOG_LOG,"sysno=%d seqno=%d (%x/%x) oc=%d/%d ofs=%d ",
177                    key.sysno, key.seqno,
178                    key.sysno, key.seqno,
179                    occur,occurx, pp->offset);
180         }
181         /* printf("\n"); */ /*!*/
182 #ifdef SKIPTHIS
183         if ( pp->is->method->debug >7 )
184            logf(LOG_LOG,"item %d=%d:%d says %d keys, counted %d",
185               isam_p, isamd_type(isam_p), isamd_block(isam_p),
186               occur, occurx); 
187 #endif
188         if (occurx != occur) 
189           logf(LOG_LOG,"Count error!!! read %d, counted %d", occur, occurx);
190         assert (occurx == occur);
191         i = pp->cat;
192         if (info[1])
193             i=SINGLETON_TYPE;
194         stat_info->no_isam_entries[i] += occur;
195         isamd_pp_close (pp);
196     }
197     if (stat_info->zh->reg->isamb)
198     {
199         ISAMB_PP pp;
200         struct it_key key;
201         int cat = isam_p & 3;
202         int level;
203         int size;
204         int blocks;
205         
206         pp = isamb_pp_open_x(stat_info->zh->reg->isamb, isam_p, &level);
207
208         while (isamb_pp_read(pp, &key))
209         {
210             stat_info->cksum = stat_info->cksum * 65509 + 
211                 key.sysno + 11 * key.seqno;
212             occur++;
213             if (-1==firstsys)
214             {
215                 firstseq=key.seqno;
216                 firstsys=key.sysno;
217             }
218             lastsys=key.sysno;
219             lastseq=key.seqno;
220         }
221         isamb_pp_close_x (pp, &size, &blocks);
222         stat_info->isamb_blocks[cat] += blocks;
223         stat_info->isamb_sizes[cat] += size;
224         if (level > 4)
225             level = 4;
226         stat_info->isamb_levels[cat][level] ++;
227         stat_info->no_isam_entries[cat] += occur;
228     }
229     i=0;
230     while (occur > stat_info->isam_bounds[i] && stat_info->isam_bounds[i])
231         i++;
232     ++(stat_info->isam_occurrences[i]);
233     if (stat_info->dumpwords)
234        print_dict_item(stat_info->zh->reg->zebra_maps, name, occur,
235           firstsys,firstseq, lastsys, lastseq);
236     return 0;
237 }
238
239 int zebra_register_statistics (ZebraHandle zh, int dumpdict)
240 {
241     int blocks;
242     int size;
243     int count;
244     int i, prev;
245     int before = 0;
246     int occur;
247     int after = 1000000000;
248     struct inv_stat_info stat_info;
249     char term_dict[2*IT_MAX_WORD+2];
250
251     if (zebra_begin_read (zh))
252         return 1;
253
254     stat_info.zh = zh;
255     stat_info.dumpwords=dumpdict;
256
257     term_dict[0] = 1;
258     term_dict[1] = 0;
259
260     for (i = 0; i<=SINGLETON_TYPE; i++)
261         stat_info.no_isam_entries[i] = 0;
262     stat_info.no_dict_entries = 0;
263     stat_info.no_dict_bytes = 0;
264     stat_info.isam_bounds[0] = 1;
265     stat_info.isam_bounds[1] = 2;
266     stat_info.isam_bounds[2] = 3;
267     stat_info.isam_bounds[3] = 6;
268     stat_info.isam_bounds[4] = 10;
269     stat_info.isam_bounds[5] = 20;
270     stat_info.isam_bounds[6] = 30;
271     stat_info.isam_bounds[7] = 50;
272     stat_info.isam_bounds[8] = 100;
273     stat_info.isam_bounds[9] = 200;
274     stat_info.isam_bounds[10] = 5000;
275     stat_info.isam_bounds[11] = 10000;
276     stat_info.isam_bounds[12] = 20000;
277     stat_info.isam_bounds[13] = 50000;
278     stat_info.isam_bounds[14] = 100000;
279     stat_info.isam_bounds[15] = 200000;
280     stat_info.isam_bounds[16] = 500000;
281     stat_info.isam_bounds[17] = 1000000;
282     stat_info.isam_bounds[18] = 0;
283
284     stat_info.cksum = 0;
285
286     for (i = 0; i<20; i++)
287         stat_info.isam_occurrences[i] = 0;
288
289     for (i = 0; i<10; i++)
290     {
291         int j;
292         for (j = 0; j<5; j++)
293             stat_info.isamb_levels[i][j] = 0;
294         stat_info.isamb_sizes[i] = 0;
295         stat_info.isamb_blocks[i] = 0;
296     }
297
298     dict_scan (zh->reg->dict, term_dict, &before, &after, &stat_info,
299                inv_stat_handle);
300
301     if (zh->reg->isamc)
302     {
303         fprintf (stdout, "   Blocks    Occur  Size KB   Bytes/Entry\n");
304         for (i = 0; isc_block_used (zh->reg->isamc, i) >= 0; i++)
305         {
306             fprintf (stdout, " %8d %8d", isc_block_used (zh->reg->isamc, i),
307                      stat_info.no_isam_entries[i]);
308
309             if (stat_info.no_isam_entries[i])
310                 fprintf (stdout, " %8d   %f",
311                          (int) ((1023.0 + (double)
312                                  isc_block_used(zh->reg->isamc, i) *
313                                  isc_block_size(zh->reg->isamc,i))/1024),
314                          ((double) isc_block_used(zh->reg->isamc, i) *
315                           isc_block_size(zh->reg->isamc,i))/
316                          stat_info.no_isam_entries[i]);
317             fprintf (stdout, "\n");
318         }
319     }
320     if (zh->reg->isamd)
321     {
322         fprintf (stdout, "   Blocks   Occur      KB Bytes/Entry\n");
323         if (zh->reg->isamd->method->debug >0) 
324             logf(LOG_LOG,"   Blocks   Occur      KB Bytes/Entry");
325         for (i = 0; i<=SINGLETON_TYPE; i++)
326         {
327             blocks= isamd_block_used(zh->reg->isamd,i);
328             size= isamd_block_size(zh->reg->isamd,i);
329             count=stat_info.no_isam_entries[i];
330             if (i==SINGLETON_TYPE) 
331                 blocks=size=0;
332             if (stat_info.no_isam_entries[i]) 
333             {
334                 fprintf (stdout, "%c %7d %7d %7d %5.2f\n",
335                          (i==SINGLETON_TYPE)?('z'):('A'+i),
336                          blocks,
337                          count,
338                          (int) ((1023.0 + (double) blocks * size)/1024),
339                          ((double) blocks * size)/count);
340                 if (zh->reg->isamd->method->debug >0) 
341                     logf(LOG_LOG, "%c %7d %7d %7d %5.2f",
342                          (i==SINGLETON_TYPE)?('z'):('A'+i),
343                          blocks,
344                          count,
345                          (int) ((1023.0 + (double) blocks * size)/1024),
346                          ((double) blocks * size)/count);
347             } /* entries */
348         } /* for */
349     } /* isamd */
350     if ( (zh->reg->isamd) && (zh->reg->isamd->method->debug>0))
351         fprintf (stdout, "\n%d words using %d bytes\n",
352              stat_info.no_dict_entries, stat_info.no_dict_bytes);
353
354     if (zh->reg->isamb)
355     {
356         for (i = 0; i<4; i++)
357         {
358             int j;
359             int bsize = isamb_block_info(zh->reg->isamb, i);
360             if (bsize < 0)
361                 break;
362             fprintf (stdout, "Category   %d\n", i);
363             fprintf (stdout, "Block size %d\n", bsize);
364             fprintf (stdout, "Blocks:    %d\n", stat_info.isamb_blocks[i]);
365             fprintf (stdout, "Size:      %d\n", stat_info.isamb_sizes[i]);
366             fprintf (stdout, "Entries:   %d\n", stat_info.no_isam_entries[i]);
367             fprintf (stdout, "Total      %d\n", stat_info.isamb_blocks[i]*
368                      bsize);
369             for (j = 0; j<5; j++)
370                 if (stat_info.isamb_levels[i][j])
371                     fprintf (stdout, "Level%d     %d\n", j,
372                              stat_info.isamb_levels[i][j]);
373             fprintf (stdout, "\n");
374         }
375     }
376     fprintf (stdout, "Checksum       %08lX\n", stat_info.cksum);
377
378     fprintf (stdout, "Distinct words %d\n", stat_info.no_dict_entries);
379     occur = 0;
380     for (i = 0; i<9; i++)
381         occur += stat_info.no_isam_entries[i];
382     fprintf (stdout, "Word pos       %d\n", occur);
383     fprintf (stdout, "    Occurrences     Words\n");
384     prev = 1;
385     for (i = 0; stat_info.isam_bounds[i]; i++)
386     {
387         int here = stat_info.isam_bounds[i];
388         fprintf (stdout, "%7d-%-7d %7d\n",
389                  prev, here, stat_info.isam_occurrences[i]);
390         prev = here+1;
391     }
392     fprintf (stdout, "%7d-        %7d\n",
393              prev, stat_info.isam_occurrences[i]);
394     xmalloc_trav("unfreed"); /*! while hunting memory leaks */    
395     zebra_end_read (zh);
396     return 0;
397 }
398