#!/usr/bin/perl
# vim: set ai et sw=3 ts=3 nu:
#
# Updates Solr with repository metadata
#
# by Pascal Bleser
#
# This library is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or (at
# your option) any later version.
#
# This library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307,
# USA.

use strict;
use warnings;

use LWP::UserAgent;
use HTTP::Date;
use File::Spec;
use File::Basename;
use HTML::Entities ();
use XML::LibXML;
use POSIX;
use IO::Uncompress::Gunzip;
use Term::ProgressBar;
use WebService::Solr;
use Getopt::Long;
use Data::Dumper;

use lib './lib';
use RPM_MD;
use YaST2_MD;

# Directory holding the *.conf repository lists used when no file is given
# on the command line, and directory holding the per-repository cache files.
my $repos = "./repos.d";
my $cache_dir = "./cache.d";

my $verbose = undef;
my $force = undef;
# Enables the extra sanity checks on parsed packages; there is currently no
# command-line switch for it, so it has to be set here.
my $validate_introspect = undef;

GetOptions(
   'v|verbose' => \$verbose,
   'f|force'   => \$force,
);

# Characters that get backslash-escaped by solr_escape(); quotemeta makes the
# list safe to interpolate into a regex character class.
my $solr_escape_chars = quotemeta( '+-&|!(){}[]^"~*?:\\' );

# ---------------------------------------------------------------------------
# Read the repository specifications
# ---------------------------------------------------------------------------
# Each non-empty, non-comment line has the form
#    repoid distribution baseurl [mdtype [flag,flag,...]]
# mdtype defaults to 'rpmmd'.
my @repos = ();
my @rfiles = ();
if (scalar(@ARGV) > 0) {
   push(@rfiles, @ARGV);
} else {
   @rfiles = grep { -f } glob($repos.'/*.conf');
}

foreach my $rfile (@rfiles) {
   open(my $fh, '<', $rfile) or die "failed to open $rfile: $!";
   while (my $line = <$fh>) {
      chomp $line;
      $line =~ s/#.*$//;   # strip comments
      $line =~ s/^\s+//;   # trim leading whitespace
      $line =~ s/\s+$//;   # trim trailing whitespace (e.g. left before a comment)
      next if $line eq '';

      if ($line =~ /^(\S+)\s+(\S+)\s+(\S+)(?:\s+(\S+))?(?:\s+(\S+))?$/) {
         my ($repoid, $distribution, $baseurl, $mdtype, $flaglist) = ($1, $2, $3, $4, $5);
         my %flags = ();
         if (defined $flaglist) {
            %flags = map { $_ => 1 } split(/\s*,\s*/, $flaglist);
         }
         push(@repos, {
            repoid       => $repoid,
            distribution => $distribution,
            baseurl      => $baseurl,
            configfile   => $rfile,
            mdtype       => (defined $mdtype ? $mdtype : 'rpmmd'),
            flags        => \%flags,
         });
      } else {
         die "invalid repo spec in $rfile at line $.";
      }
   }
   close($fh);
}

# ---------------------------------------------------------------------------
# HTTP client and Solr connection
# ---------------------------------------------------------------------------
my $ua = LWP::UserAgent->new(
   timeout      => 10,
   agent        => "webpin-repomanager/1.0",
   max_redirect => 4,
);
$ua->env_proxy();

# autocommit is off: an explicit commit is issued once per repository below.
my $solr = WebService::Solr->new("http://localhost:8983/solr", {
   autocommit => 0,
});
$solr->ping() or die "failed to ping Solr";

# Builds a WebService::Solr::Field from a name and a value.
sub f($$) {
   my ($name, $value) = @_;
   return WebService::Solr::Field->new($name => $value);
}

# Returns a copy of the given string with every character listed in
# $solr_escape_chars prefixed by a backslash. Dies when passed undef.
sub solr_escape($) {
   my $v = shift;
   die "null value passed to solr_escape" unless defined $v;
   $v =~ s{([$solr_escape_chars])}{\\$1}g;
   return $v;
}

# ---------------------------------------------------------------------------
# Process every repository
# ---------------------------------------------------------------------------
my @gone = ();   # repositories for which the parser returned a hash ref
my $total = 0;   # number of documents added to Solr over the whole run

REPO:
foreach my $r (@repos) {
   print $r->{repoid}, "\n" if $verbose;

   my $cache = File::Spec->catfile($cache_dir, $r->{repoid}.".cache");
   {
      my $dir = dirname($cache);
      unless (-d $dir) {
         # Best effort: a missing cache directory only matters when the cache
         # is written at the end of the iteration.
         mkdir($dir, 0750) or warn "failed to create cache directory $dir: $!";
      }
   }

   # The cache file holds three lines: timestamp, Last-Modified, ETag
   # (the ETag line may be empty).
   my $timestamp = undef;
   my $last_modified = undef;
   my $etag = undef;
   if (-e $cache) {
      open(my $fh, '<', $cache) or die "failed to open cache file $cache: $!";
      foreach my $slot (\$timestamp, \$last_modified, \$etag) {
         my $line = <$fh>;
         last unless defined $line;   # tolerate a truncated cache file
         chomp $line;
         $$slot = $line;
      }
      close($fh);
   }

   my $h = {
      last_modified => $last_modified,
      etag          => $etag,
      timestamp     => $timestamp,
   };

   # Pick the parser by metadata type and run it; a parser failure skips the
   # repository but does not abort the run.
   my $mdtype = $r->{mdtype};
   my $is_rpmmd = ($mdtype eq 'rpmmd' or $mdtype eq 'rpm-md');
   my $is_yast2 = ($mdtype eq 'yast2');
   unless ($is_rpmmd or $is_yast2) {
      warn "unsupported repository type \"".$r->{mdtype}."\"";
      next REPO;
   }

   my $pr = undef;
   eval {
      $pr = $is_rpmmd
         ? parse_rpmmd($r, $h, $ua, $verbose)
         : parse_y2md($r, $h, $ua, $verbose);
      1;
   } or do {
      warn "failed to parse repository ".$r->{repoid}.": ".$@;
      next REPO;
   };

   # Parser result:
   #   array ref -> [ packages, repository headers ]
   #   hash ref  -> repository is reported as disappeared at the end
   #   undef     -> nothing to do (presumably unchanged since the cached
   #                headers; confirm in RPM_MD / YaST2_MD)
   my $packages = undef;
   my $repoheaders = undef;
   if (ref($pr) eq 'ARRAY') {
      ($packages, $repoheaders) = @{$pr}[0, 1];
   } elsif (ref($pr) eq 'HASH') {
      push(@gone, $r);
      next REPO;
   } elsif (not defined($pr)) {
      next REPO;
   } else {
      warn "unsupported scalar returned by parser: $pr";
      next REPO;
   }

   if ($validate_introspect) {
      foreach my $p (@$packages) {
         my $nvra = sub { join('-', map { $p->{$_} } qw(name version release arch)) };
         die "missing summary in ".$nvra->() unless exists $p->{summary};
         foreach my $k (keys %$p) {
            my $v = $p->{$k};
            die "found undef for $k in package ".$nvra->() unless defined $v;
            next unless ref($v) eq 'ARRAY';
            foreach my $item (@$v) {
               die "found undef in list $k in package ".$nvra->() unless defined $item;
            }
         }
      }
      foreach my $p (@$packages) {
         if (not exists $p->{id} or not defined $p->{id}) {
            die "no id: " . Dumper($p);
         }
      }
   }

   # Index the source packages by file name so that binary packages can be
   # linked to the id of their source rpm.
   my %source_rpm_index = ();
   foreach my $p (@$packages) {
      next unless $p->{arch} eq "src";
      my $k = basename($p->{location});
      die "no id for $k" unless exists $p->{id} and defined $p->{id};
      $source_rpm_index{$k} = $p;
   }

   my @docs = ();
   foreach my $p (@$packages) {
      foreach my $key (qw(repoid distribution)) {
         $p->{$key} = $r->{$key};
      }
      $p->{repourl} = $r->{baseurl};

      if (exists $p->{sourcerpm} and $p->{arch} ne "src" and $p->{arch} ne "nosrc") {
         my $srpm = $p->{sourcerpm};
         my $s = $source_rpm_index{$srpm};
         if (defined $s) {
            my $sid = $s->{id};
            die "undefined id for $srpm" unless defined $sid;
            $p->{sourcerpmid} = $sid;
         } else {
            #warn "failed to find source rpm $srpm";
         }
      }

      # The rpm group is stored under two field names instead of 'rpmgroup'.
      {
         my $g = delete $p->{rpmgroup};
         $p->{group_exact} = $g;
         $p->{group_last} = $g;
      }

      # post-process
      if (exists $p->{description} and defined $p->{description}) {
         # drop everything from an "Author(s):" marker to the end
         $p->{description} =~ s/\s*\bAuthors?:?.*$//ms;
      }

      if (exists $p->{packager} and defined $p->{packager}) {
         # remove e-mail addresses, then encode what is left
         $p->{packager} =~ s/\s*<.+@.+>//;
         $p->{packager} =~ s/\w.+@.+\w//;
         $p->{packager} = HTML::Entities::encode_numeric($p->{packager});
      }

      # Filter out rpmlib/libc/debuginfo capabilities; a missing list is
      # treated as empty.
      foreach my $tag (qw(requires provides)) {
         my @pp = grep { not /^(rpmlib|libc\.so|debuginfo\()/ } @{ $p->{$tag} || [] };
         $p->{$tag} = \@pp;
      }

      # Collect the names inside mimetype(...)/mimehandler(...) and perl(...)
      # provides.
      $p->{mime} = [];
      $p->{perl} = [];
      foreach my $cap (@{$p->{provides}}) {
         push(@{$p->{mime}}, $1) if $cap =~ /^(?:mimetype|mimehandler)\((.+?)\)/;
         push(@{$p->{perl}}, $1) if $cap =~ /^perl\((.+?)\)/;
      }

      # Derive tags from the package name and architecture.
      $p->{tag} = [];
      push(@{$p->{tag}}, 'doc') if $p->{name} =~ /-doc$/;
      push(@{$p->{tag}}, 'lang') if $p->{name} =~ /-lang$/;
      push(@{$p->{tag}}, 'devel') if $p->{name} =~ /-devel$/;
      push(@{$p->{tag}}, 'perl') if $p->{name} =~ /^perl-\D/;
      push(@{$p->{tag}}, 'python') if $p->{name} =~ /^python-\D/;
      push(@{$p->{tag}}, 'ruby') if $p->{name} =~ /^ruby(?:gem)?-\D/;
      push(@{$p->{tag}}, 'lib') if $p->{name} =~ /^lib/;
      push(@{$p->{tag}}, 'debug') if $p->{name} =~ /\-debug(info|source)$/;
      push(@{$p->{tag}}, 'src') if $p->{arch} eq "src" or $p->{arch} eq "nosrc";

      # make a Solr document from that: list values become repeated fields
      my @fields = ();
      foreach my $k (sort keys %$p) {
         next if $k eq 'configfile';
         my $v = $p->{$k};
         if (ref($v) eq 'ARRAY') {
            foreach my $item (@$v) {
               die "undef found for $k in ".join("-", ($p->{name}, $p->{version}, $p->{release})) if not defined $item;
               push(@fields, f($k, $item));
            }
         } elsif (ref($v) eq '') {
            push(@fields, f($k, $v));
         } else {
            die "wtf, a ref ? ($k)";
         }
      }

      my $doc = WebService::Solr::Document->new;
      $doc->add_fields(@fields);
      push(@docs, $doc);
   }

   # Compare the freshly parsed documents with what Solr already holds for
   # this repoid, using the 'sha' field as identity.
   my $solr_repoid = solr_escape($r->{repoid});
   my @missing;
   my @newones;
   {
      # load all the documents in Solr for the current repoid
      my $exres = $solr->search('', {
         'q.alt' => 'repoid:'.$solr_repoid,
         'fl'    => 'id,name,version,release,arch,sha',
         'rows'  => '9999999',
         'start' => 0,
      });
      my @exdocs = $exres->docs;

      my %parsed_sha = map { $_->value_for('sha') => 1 } @docs;
      my %stored_sha = map { $_->value_for('sha') => 1 } @exdocs;
      @missing = grep { not exists $parsed_sha{$_->value_for('sha')} } @exdocs;
      @newones = grep { not exists $stored_sha{$_->value_for('sha')} } @docs;
   }

   if ($verbose) {
      print "\n";
      print "\t", scalar(@newones), " new packages", "\n";
      print "\t", scalar(@missing), " packages have gone missing", "\n";
   }

   if (scalar(@missing) > 0) {
      print "\t", "deleting ", scalar(@missing), " packages", "\n" if $verbose;
      foreach my $p (@missing) {
         my $solr_id = solr_escape($p->value_for('id'));
         $solr->delete_by_query('id:'.$solr_id) or die "failed to delete id:".$solr_id;
      }
   }

   if (scalar(@newones) > 0) {
      my $progress = undef;
      if ($verbose) {
         print "\t", "adding ", scalar(@newones), " documents to Solr", "\n";
         $progress = Term::ProgressBar->new({
            count => scalar(@newones),
            name  => "adding to Solr",
            ETA   => 'linear',
         });
         $progress->minor(0);
      }

      # Send the documents in batches of 10; the remainder goes out after
      # the loop.
      my $chunk = [];
      my $sent = 0;
      foreach my $d (@newones) {
         push(@$chunk, $d);
         $sent++;
         if (scalar(@$chunk) >= 10) {
            $solr->add($chunk, { overwrite => 1 });
            $chunk = [];
            $progress->update($sent) if $progress;
         }
      }
      if (scalar(@$chunk) > 0) {
         $solr->add($chunk, { overwrite => 1 });
      }
      $progress->update(scalar(@newones)) if $progress;
      #$solr->add(\@newones, { overwrite => 1 });
      $total += scalar(@newones);
   }

   print "\t", "committing Solr", "\n" if $verbose;
   $solr->commit();
   print "\n" if $verbose;

   # save to cache, only when the parser supplied both a timestamp and a
   # Last-Modified value
   if (ref($repoheaders) eq 'HASH'
         and defined $repoheaders->{timestamp}
         and defined $repoheaders->{last_modified}) {
      open(my $fh, '>', $cache) or die "failed to open cache for write: $cache: $!";
      print $fh $repoheaders->{timestamp}, "\n";
      print $fh $repoheaders->{last_modified}, "\n";
      if (defined $repoheaders->{etag}) {
         print $fh $repoheaders->{etag}, "\n";
      } else {
         # keep the file at three lines even without an ETag
         print $fh "\n";
      }
      # buffered write errors only surface at close
      close($fh) or die "failed to write cache $cache: $!";
      print "\t", "saved cache to ", $cache, "\n" if $verbose;
   }
}

if ($total > 0) {
   print "\n", "optimizing Solr index", "\n" if $verbose;
   $solr->optimize();
}

if (scalar(@gone) > 0) {
   print "The following repositories have disappeared:", "\n";
   foreach my $r (@gone) {
      print join(" ", map { $r->{$_} } qw(configfile repoid distribution baseurl)), "\n";
   }
}