727ab59 by Pascal Bleser at 2010-07-23 1
#!/usr/bin/perl
2
# vim: set ai et sw=3 ts=3 nu:
3
#
4
# Updates Solr with repository metadata
5
#
6
# by Pascal Bleser <pascal.bleser@opensuse.org>
7
#
8
#     This library is free software; you can redistribute it and/or modify it
9
#     under the terms of the GNU Lesser General Public License as published by
10
#     the Free Software Foundation; either version 2.1 of the License, or (at
11
#     your option) any later version.
12
#                 
13
#     This library is distributed in the hope that it will be useful, but
14
#     WITHOUT ANY WARRANTY; without even the implied warranty of
15
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
#     Lesser General Public License for more details.
17
#      
18
#     You should have received a copy of the GNU Lesser General Public
19
#     License along with this library; if not, write to the Free Software
20
#     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307,
21
#     USA.
22
23
use strict;
24
use warnings;
25
use LWP::UserAgent;
26
use HTTP::Date;
27
use File::Spec;
28
use File::Basename;
29
use HTML::Entities ();
30
use XML::LibXML;
31
use POSIX;
32
use IO::Uncompress::Gunzip;
33
use Term::ProgressBar;
34
use WebService::Solr;
35
use Getopt::Long;
36
caf8fe6 by Pascal Bleser at 2010-07-28 37
use lib './lib';
38
use RPM_MD;
39
use YaST2_MD;
40
727ab59 by Pascal Bleser at 2010-07-23 41
my $repos = "./repos.d";
42
my $cache_dir = "./cache.d";
43
my $verbose = undef;
b99c742 by Pascal Bleser at 2010-08-05 44
my $force = undef;
a19b762 by Pascal Bleser at 2010-08-08 45
my $validate_introspect = undef;
727ab59 by Pascal Bleser at 2010-07-23 46
47
GetOptions(
48
   'v|verbose' => \$verbose,
b99c742 by Pascal Bleser at 2010-08-05 49
   'f|force'   => \$force,
727ab59 by Pascal Bleser at 2010-07-23 50
);
51
52
my $solr_escape_chars = quotemeta( '+-&|!(){}[]^"~*?:\\' );
53
my @repos = ();
54
55
my @rfiles = ();
56
if (scalar(@ARGV) > 0) {
57
   push(@rfiles, @ARGV);
58
} else {
59
   @rfiles = grep { -f } glob($repos.'/*.conf');
60
}
61
62
foreach my $rfile (@rfiles) {
63
   open(my $fh, '<', $rfile) or die "failed to open $rfile: $!";
64
   while (<$fh>) {
65
      chomp;
66
      s/#.*$//;
67
      s/^\s*//;
68
      s/\s*//;
69
      next if /^$/;
b99c742 by Pascal Bleser at 2010-08-05 70
      if (/^(\S+)\s+(\S+)\s+(\S+)(?:\s+(\S+))?(?:\s+(\S+))?$/) {
caf8fe6 by Pascal Bleser at 2010-07-28 71
         my $r = {
72
            repoid         => $1,
b99c742 by Pascal Bleser at 2010-08-05 73
            distribution   => $2,
74
            baseurl        => $3,
ba00e9a by Pascal Bleser at 2010-07-29 75
            configfile     => $rfile,
caf8fe6 by Pascal Bleser at 2010-07-28 76
         };
b99c742 by Pascal Bleser at 2010-08-05 77
         $r->{mdtype} = defined $4 ? $4 : 'rpmmd';
78
         if (defined $5) {
79
            my %flags = map { $_ => 1 } split(/\s*,\s*/, $5);
80
            $r->{flags} = \%flags;
81
         } else {
82
            $r->{flags} = {};
83
         }
84
caf8fe6 by Pascal Bleser at 2010-07-28 85
         push(@repos, $r);
86
      } else {
87
         die "invalid repo spec in $rfile at line $.";
88
      }
727ab59 by Pascal Bleser at 2010-07-23 89
   }
90
   close($fh);
91
}
92
93
my $ua = LWP::UserAgent->new(
94
   timeout      => 10,
95
   agent        => "webpin-repomanager/1.0",
96
   max_redirect => 4,
97
);
98
$ua->env_proxy();
99
100
my $solr = WebService::Solr->new("http://localhost:8983/solr", {
101
    autocommit => 0,
102
});
103
$solr->ping() or die "failed to ping Solr";
104
105
sub f($$) {
106
    my $name = shift;
107
    my $value = shift;
108
    my $field = WebService::Solr::Field->new($name => $value);
109
    return $field;
110
}
111
112
sub solr_escape($) {
113
   my $v = shift;
a19b762 by Pascal Bleser at 2010-08-08 114
   die "null value passed to solr_escape" unless defined $v;
727ab59 by Pascal Bleser at 2010-07-23 115
   $v =~ s{([$solr_escape_chars])}{\\$1}g;
116
   return $v;
117
}
118
ba00e9a by Pascal Bleser at 2010-07-29 119
my @gone = ();
ac9c69e by Pascal Bleser at 2010-07-28 120
my $total = 0;
727ab59 by Pascal Bleser at 2010-07-23 121
foreach my $r (@repos) {
122
   print $r->{repoid}, "\n" if $verbose;
123
124
   my $cache = File::Spec->catfile($cache_dir, $r->{repoid}.".cache");
125
   {
126
      my $dir = dirname($cache);
127
      mkdir($dir, 0750) unless -d $dir;
128
   }
129
130
   my $timestamp = undef;
131
   my $last_modified = undef;
132
   my $etag = undef;
133
   {
134
      if (-e $cache) {
135
         open(my $fh, '<', $cache) or die "failed to open cache file $cache: $!";
136
         chomp($timestamp = <$fh>);
137
         chomp($last_modified = <$fh>);
138
         chomp($etag = <$fh>);
139
         close($fh);
140
      }
141
   }
142
caf8fe6 by Pascal Bleser at 2010-07-28 143
   my $h = {
144
      last_modified => $last_modified,
145
      etag          => $etag,
146
      timestamp     => $timestamp,
147
   };
727ab59 by Pascal Bleser at 2010-07-23 148
caf8fe6 by Pascal Bleser at 2010-07-28 149
   my @docs = ();
150
   my $packages = undef;
151
   my $repoheaders = undef;
152
   {
ba00e9a by Pascal Bleser at 2010-07-29 153
      my $pr = undef;
b99c742 by Pascal Bleser at 2010-08-05 154
      if ($r->{mdtype} eq 'rpmmd' or $r->{mdtype} eq 'rpm-md') {
caf8fe6 by Pascal Bleser at 2010-07-28 155
         eval {
ba00e9a by Pascal Bleser at 2010-07-29 156
            $pr = parse_rpmmd($r, $h, $ua, $verbose);
157
            #($packages, $repoheaders) = parse_rpmmd($r, $h, $ua, $verbose);
caf8fe6 by Pascal Bleser at 2010-07-28 158
         };
159
         if ($@) {
160
            warn "failed to parse repository ".$r->{repoid}.": ".$@;
161
            next;
162
         }
163
      } elsif ($r->{mdtype} eq 'yast2') {
164
         eval {
ba00e9a by Pascal Bleser at 2010-07-29 165
            $pr = parse_y2md($r, $h, $ua, $verbose);
caf8fe6 by Pascal Bleser at 2010-07-28 166
         };
167
         if ($@) {
168
            warn "failed to parse repository ".$r->{repoid}.": ".$@;
169
            next;
727ab59 by Pascal Bleser at 2010-07-23 170
         }
caf8fe6 by Pascal Bleser at 2010-07-28 171
      } else {
172
         warn "unsupported repository type \"".$r->{mdtype}."\"";
173
         next;
727ab59 by Pascal Bleser at 2010-07-23 174
      }
175
ba00e9a by Pascal Bleser at 2010-07-29 176
      if (ref($pr) eq 'ARRAY') {
177
         $packages = $pr->[0];
178
         $repoheaders = $pr->[1];
179
      } elsif (ref($pr) eq 'HASH') {
180
         push(@gone, $r);
181
         next;
182
      } elsif (not defined($pr)) {
183
         next;
184
      } else {
185
         warn "unsupported scalar returned by parser: $pr";
caf8fe6 by Pascal Bleser at 2010-07-28 186
         next;
727ab59 by Pascal Bleser at 2010-07-23 187
      }
188
a19b762 by Pascal Bleser at 2010-08-08 189
      if ($validate_introspect) {
190
         foreach my $p (@$packages) {
191
            die "missing summary in ".join('-', map { $p->{$_} } qw(name version release arch)) unless exists $p->{summary};
192
            while (my ($k, $v) = each(%$p)) {
193
               die "found undef for $k in package ".join('-', map { $p->{$_} } qw(name version release arch)) unless defined $v;
194
               if (ref($v) eq 'ARRAY') {
195
                  foreach (@$v) {
196
                     die "found undef in list $k in package ".join('-', map { $p->{$_} } qw(name version release arch)) unless defined $_;
197
                  }
caf8fe6 by Pascal Bleser at 2010-07-28 198
               }
199
            }
200
         }
a19b762 by Pascal Bleser at 2010-08-08 201
         foreach my $p (@$packages) {
202
            if (not exists $p->{id} or not defined $p->{id}) {
203
               use Data::Dumper;
204
               die "no id: " . Dumper($p);
205
            }
206
         }
caf8fe6 by Pascal Bleser at 2010-07-28 207
      }
727ab59 by Pascal Bleser at 2010-07-23 208
a19b762 by Pascal Bleser at 2010-08-08 209
      my %source_rpm_index = ();
caf8fe6 by Pascal Bleser at 2010-07-28 210
      foreach my $p (@$packages) {
a19b762 by Pascal Bleser at 2010-08-08 211
         if ($p->{arch} eq "src") {
212
            my $k = basename($p->{location});
213
            die "no id for $k" unless exists $p->{id} and defined $p->{id};
214
            $source_rpm_index{$k} = $p;
215
         }
216
      }
727ab59 by Pascal Bleser at 2010-07-23 217
a19b762 by Pascal Bleser at 2010-08-08 218
      foreach my $p (@$packages) {
b99c742 by Pascal Bleser at 2010-08-05 219
         foreach (qw(repoid distribution)) {
caf8fe6 by Pascal Bleser at 2010-07-28 220
            $p->{$_} = $r->{$_};
727ab59 by Pascal Bleser at 2010-07-23 221
         }
b99c742 by Pascal Bleser at 2010-08-05 222
         $p->{repourl} = $r->{baseurl};
a19b762 by Pascal Bleser at 2010-08-08 223
224
         if (exists $p->{sourcerpm} and $p->{arch} ne "src" and $p->{arch} ne "nosrc") {
225
            my $srpm = $p->{sourcerpm};
226
            my $s = $source_rpm_index{$srpm};
227
            if (defined $s) {
228
               my $sid = $s->{id};
229
               die "undefined id for $srpm" unless defined $sid;
230
               $p->{sourcerpmid} = $sid;
231
            } else {
232
               #warn "failed to find source rpm $srpm";
233
            }
234
         }
235
236
         {
237
            my $g = $p->{rpmgroup};
238
            delete $p->{rpmgroup};
239
            $p->{group_exact} = $g;
240
            $p->{group_last} = $g;
241
         }
727ab59 by Pascal Bleser at 2010-07-23 242
caf8fe6 by Pascal Bleser at 2010-07-28 243
         # post-process
244
         if (exists $p->{description} and defined $p->{description}) {
245
            $p->{description} =~ s/\s*\bAuthors?:?.*$//ms;
246
         }
727ab59 by Pascal Bleser at 2010-07-23 247
         
caf8fe6 by Pascal Bleser at 2010-07-28 248
         if (exists $p->{packager} and defined $p->{packager}) {
249
            $p->{packager} =~ s/\s*<.+@.+>//;
250
            $p->{packager} =~ s/\w.+@.+\w//;
251
            $p->{packager} = HTML::Entities::encode_numeric($p->{packager});
727ab59 by Pascal Bleser at 2010-07-23 252
         }
253
caf8fe6 by Pascal Bleser at 2010-07-28 254
         {
255
            foreach my $tag (qw(requires provides)) {
256
               my @pp = grep { not /^(rpmlib|libc\.so|debuginfo\()/ } @{$p->{$tag}};
257
               $p->{$tag} = \@pp;
727ab59 by Pascal Bleser at 2010-07-23 258
            }
259
         }
260
caf8fe6 by Pascal Bleser at 2010-07-28 261
         $p->{mime} = [];
262
         $p->{perl} = [];
263
         foreach ($p->{provides}) {
264
            push(@{$p->{mime}}, $1) if /^(?:mimetype|mimehandler)\(.+?\)/;
265
            push(@{$p->{perl}}, $1) if /^perl\(.+?\)/;
727ab59 by Pascal Bleser at 2010-07-23 266
         }
267
ba00e9a by Pascal Bleser at 2010-07-29 268
         $p->{tag} = [];
269
         push(@{$p->{tag}}, 'doc') if $p->{name} =~ /-doc$/;
270
         push(@{$p->{tag}}, 'lang') if $p->{name} =~ /-lang$/;
271
         push(@{$p->{tag}}, 'devel') if $p->{name} =~ /-devel$/;
272
         push(@{$p->{tag}}, 'perl') if $p->{name} =~ /^perl-\D$/;
273
         push(@{$p->{tag}}, 'python') if $p->{name} =~ /^python-\D$/;
274
         push(@{$p->{tag}}, 'ruby') if $p->{name} =~ /^ruby(gem)?-\D$/;
275
         push(@{$p->{tag}}, 'lib') if $p->{name} =~ /^lib/;
a19b762 by Pascal Bleser at 2010-08-08 276
         push(@{$p->{tag}}, 'debug') if $p->{name} =~ /\-debug(info|source)$/;
277
         push(@{$p->{tag}}, 'src') if $p->{arch} eq "src" or $p->{arch} eq "nosrc";
ba00e9a by Pascal Bleser at 2010-07-29 278
caf8fe6 by Pascal Bleser at 2010-07-28 279
         # make a Solr document from that
280
         my @fields = ();
281
         while (my ($k, $v) = each(%$p)) {
ba00e9a by Pascal Bleser at 2010-07-29 282
            next if $k eq 'configfile';
283
caf8fe6 by Pascal Bleser at 2010-07-28 284
            if (ref($v) eq 'ARRAY') {
285
               foreach (@$v) {
286
                  die "undef found for $k in ".join("-", ($p->{name}, $p->{version}, $p->{release})) if not defined $_;
287
                  push(@fields, f($k, $_));
288
               }
289
            } elsif (ref($v) eq '') {
290
               push(@fields, f($k, $v));
291
            } else {
292
               die "wtf, a ref ? ($k)";
293
            }
294
         }
727ab59 by Pascal Bleser at 2010-07-23 295
         my $doc = WebService::Solr::Document->new;
296
         $doc->add_fields(@fields);
297
         push(@docs, $doc);
298
      }
299
300
      my $solr_repoid = solr_escape($r->{repoid});
b99c742 by Pascal Bleser at 2010-08-05 301
302
      my @missing;
303
      my @newones;
304
      {
305
         # load all the documents in Solr for the current repoid
306
         my $exres = $solr->search('', {
307
            'q.alt'  => 'repoid:'.$solr_repoid,
308
            'fl'     => 'id,name,version,release,arch,sha',
309
            'rows'   => '9999999',
310
            'start'  => 0,
311
         });
312
         my @exdocs = $exres->docs;
313
314
         {
315
            my %index = map { $_->value_for('sha') => 1 } @docs;
316
            @missing = grep { not exists $index{$_->value_for('sha')} } @exdocs;
317
         }
318
         {
319
            my %index = map { $_->value_for('sha') => 1 } @exdocs;
320
            @newones = grep { not exists $index{$_->value_for('sha')} } @docs;
321
         }
322
      }
323
      if ($verbose) {
324
         print "\n";
325
         print "\t", scalar(@newones), " new packages", "\n";
326
         print "\t", scalar(@missing), " packages have gone missing", "\n";
327
      }
328
329
      if (scalar(@missing) > 0) {
330
         print "\t", "deleting ", scalar(@missing), " packages", "\n" if $verbose;
331
         foreach my $p (@missing) {
a19b762 by Pascal Bleser at 2010-08-08 332
            my $solr_id = solr_escape($p->value_for('id'));
b99c742 by Pascal Bleser at 2010-08-05 333
            $solr->delete_by_query('id:'.$solr_id) or die "failed to delete id:".$solr_id;
334
         }
335
      }
336
      if (scalar(@newones) > 0) {
caf8fe6 by Pascal Bleser at 2010-07-28 337
         my $progress = undef;
338
         if ($verbose) {
b99c742 by Pascal Bleser at 2010-08-05 339
            print "\t", "adding ", scalar(@newones), " documents to Solr", "\n";
caf8fe6 by Pascal Bleser at 2010-07-28 340
            $progress = Term::ProgressBar->new({
b99c742 by Pascal Bleser at 2010-08-05 341
               count => scalar(@newones),
caf8fe6 by Pascal Bleser at 2010-07-28 342
               name  => "adding to Solr",
343
               ETA   => 'linear',
344
            });
345
            $progress->minor(0);
346
         }
b99c742 by Pascal Bleser at 2010-08-05 347
         my $chunk = [];
348
         my $i = 0;
349
         foreach my $d (@newones) {
350
            push(@$chunk, $d);
351
            if (($i % 10) == 0) {
352
               $solr->add($chunk, { overwrite => 1 });
353
               $chunk = [];
354
               $progress->update($i) if $progress;
355
            }
356
            $i++;
357
         }
358
         if (scalar(@$chunk) > 0) {
359
            $solr->add($chunk, { overwrite => 1 });
caf8fe6 by Pascal Bleser at 2010-07-28 360
         }
b99c742 by Pascal Bleser at 2010-08-05 361
         $progress->update(scalar(@newones)) if $progress;
362
363
         #$solr->add(\@newones, { overwrite => 1 });
364
         $total += scalar(@newones);
727ab59 by Pascal Bleser at 2010-07-23 365
      }
366
      print "\t", "committing Solr", "\n" if $verbose;
367
      $solr->commit();
b99c742 by Pascal Bleser at 2010-08-05 368
      print "\n" if $verbose;
727ab59 by Pascal Bleser at 2010-07-23 369
   }
370
371
   # save to cache
caf8fe6 by Pascal Bleser at 2010-07-28 372
   if (exists $repoheaders->{timestamp} and defined $repoheaders->{timestamp} and exists $repoheaders->{last_modified} and defined $repoheaders->{last_modified}) {
727ab59 by Pascal Bleser at 2010-07-23 373
      open(my $fh, '>', $cache) or die "failed to open cache for write: $cache: $!";
caf8fe6 by Pascal Bleser at 2010-07-28 374
      print $fh $repoheaders->{timestamp}, "\n";
375
      print $fh $repoheaders->{last_modified}, "\n";
376
      if (exists $repoheaders->{etag} and defined $repoheaders->{etag}) {
377
         print $fh $repoheaders->{etag}, "\n"; #->header("etag"), "\n";
727ab59 by Pascal Bleser at 2010-07-23 378
      } else {
379
         print $fh "\n";
380
      }
381
      close($fh);
b99c742 by Pascal Bleser at 2010-08-05 382
      print "\t", "saved cache to ", $cache, "\n" if $verbose;
727ab59 by Pascal Bleser at 2010-07-23 383
   }
384
385
}
386
ac9c69e by Pascal Bleser at 2010-07-28 387
if ($total > 0) {
b99c742 by Pascal Bleser at 2010-08-05 388
   print "\n", "optimizing Solr index", "\n" if $verbose;
ac9c69e by Pascal Bleser at 2010-07-28 389
   $solr->optimize();
390
}
caf8fe6 by Pascal Bleser at 2010-07-28 391
ba00e9a by Pascal Bleser at 2010-07-29 392
if (scalar(@gone) > 0) {
393
   print "The following repositories have disappeared:", "\n";
394
   foreach my $r (@gone) {
b99c742 by Pascal Bleser at 2010-08-05 395
      print join("    ", map { $r->{$_} } qw(configfile repoid distribution baseurl)), "\n";
ba00e9a by Pascal Bleser at 2010-07-29 396
   }
397
}
398