Codebase list libcatmandu-wikidata-perl / 01399d9
extended and renamed fixes Jakob Voss 10 years ago
11 changed file(s) with 257 addition(s) and 142 deletion(s). Raw diff Collapse all Expand all
44 - "5.16"
55 - "5.14"
66 - "5.12"
7 - "5.10"
87
98 before_install:
109 - git config --global user.name "TravisCI"
00 name = Catmandu-Wikidata
11 license = Perl_5
2 version = 0.03
2 version = 0.04
33 copyright_year = 2014
44 author = Jakob Voß
55 copyright_holder = Jakob Voß
0 package Catmandu::Fix::wd_language;
1 #ABSTRACT: Limit string values to a selected language
2 #VERSION
3 use Catmandu::Sane;
4 use Moo;
5
# Language code of the strings to keep; must be given.
has language => (
    is       => 'ro',
    required => 1,
);

# Read-only flag; not consulted by any code in this package so far.
has force => (
    is => 'ro',
);

# The constructor takes the language as its single positional argument and
# hands it to the wrapped BUILDARGS in named form.
around BUILDARGS => sub {
    my ($next, $class, $lang) = @_;
    return $next->($class, { language => $lang });
};
14
# Limit labels, descriptions, and aliases of a record to the configured
# language. The record is modified in place and returned.
#
#   labels       => label        (single string, removed if not available)
#   descriptions => description  (single string, removed if not available)
#   aliases      => aliases      (array of strings, possibly empty)
sub fix {
    my ($self, $data) = @_;
    my $lang = $self->language;

    my %singular_of = (labels => 'label', descriptions => 'description');

    for my $plural (qw(labels descriptions)) {
        next unless exists $data->{$plural};

        my $value = $data->{$plural};
        if (ref $value) {
            # language map: pick the requested language or drop the field
            my $picked = $value->{$lang};
            if (!defined $picked) {
                delete $data->{$plural};
                next;
            }
            $value = ref $picked ? $picked->{value} : $picked;
        }

        # simple strings are kept as given, only the field is renamed
        delete $data->{$plural};
        $data->{ $singular_of{$plural} } = $value;
    }

    my $aliases = $data->{aliases};
    if (ref($aliases) eq 'HASH') {
        my $list = $aliases->{$lang};
        $data->{aliases} = defined $list
            ? [ map { ref $_ ? $_->{value} : $_ } @$list ]
            : [];
    }

    # TODO: only delete if string of requested language was found (or force)

    return $data;
}
55
56 1;
57
58 =head1 DESCRIPTION
59
60 This L<Catmandu::Fix> modifies a Wikidata entity record, as imported by
61 L<Catmandu::Importer::Wikidata>, by deleting all language tagged strings (in
62 C<aliases>, C<labels>, and C<descriptions>) except a selected language. The
63 strings are also simplified as done with L<Catmandu::Fix::wd_simple_strings>.
64
65 =encoding utf8
0 package Catmandu::Fix::wd_simple;
1 #ABSTRACT: Simplify Wikidata entity records
2 #VERSION
3 use Catmandu::Sane;
4 use Moo;
5
6 use Catmandu::Fix::wd_simple_strings;
7 use Catmandu::Fix::wd_simple_claims;
8
# Simplify a Wikidata entity record in place and return it: strings first,
# then claims, finally the "site" field is removed from every sitelink.
sub fix {
    my ($self, $data) = @_;

    Catmandu::Fix::wd_simple_strings::fix($self, $data);
    Catmandu::Fix::wd_simple_claims::fix($self, $data);

    my $sitelinks = $data->{sitelinks};
    if ($sitelinks) {
        for my $link (values %$sitelinks) {
            delete $link->{site};
        }
    }

    return $data;
}
23
24 1;
25
26 =head1 DESCRIPTION
27
28 This L<Catmandu::Fix> simplifies a Wikidata entity record by applying both
29 L<Catmandu::Fix::wd_simple_strings> and L<Catmandu::Fix::wd_simple_claims>. It
30 further simplifies sitelinks by removing redundant fields.
31
32 =encoding utf8
0 package Catmandu::Fix::wd_simple_claims;
1 #ABSTRACT: Simplify claims of a Wikidata entity record
2 #VERSION
3 use Catmandu::Sane;
4 use Moo;
5
6 # TODO: also support other snak types
7 # See https://www.mediawiki.org/wiki/Wikibase/DataModel for more
# Simplify a single snak (hash reference) in place.
#
# The redundant "property" field is removed. If the snak has a "datavalue"
# hash, its fields are lifted into the snak itself and the "datavalue" and
# "type" fields are removed. A value of datatype "wikibase-item" is reduced
# to its numeric id.
#
# Anything that is not a hash reference is ignored. Returns nothing.
sub simplify_snak {
    my ($snak) = @_;
    return unless ref $snak eq 'HASH';

    delete $snak->{property};    # redundant

    my $datavalue = $snak->{datavalue};
    return unless ref $datavalue eq 'HASH';

    # unnecessary nesting: lift the fields of the data value into the snak
    for my $key (keys %$datavalue) {
        $snak->{$key} = $datavalue->{$key};
    }

    # Guard against a missing datatype (comparing undef with eq warns) and
    # against a value that is not a hash (dereferencing it dies under strict).
    if (($snak->{datatype} // '') eq 'wikibase-item'
        and ref $snak->{value} eq 'HASH') {
        $snak->{value} = $snak->{value}->{'numeric-id'};
    }

    delete $snak->{type};        # equals to datatype
    delete $snak->{datavalue};

    return;
}
22
# Simplify the claims of a Wikidata entity record in place.
#
# For every claim the internal "id" and the constant "type" are removed, the
# main snak is simplified and its fields are lifted into the claim, and all
# snaks of all references are simplified. Records without claims are
# returned unchanged. Returns the record.
sub fix {
    my ($self, $data) = @_;

    my $claims = $data->{claims} or return $data;

    # Iterate with values() instead of each(): each() continues a partially
    # consumed hash iterator and could silently skip properties, and the
    # property key is not needed here anyway.
    for my $statements (values %$claims) {
        for my $claim (@$statements) {
            delete $claim->{id};      # internal
            delete $claim->{type};    # always "statement"

            # simplify the main snak, then remove the unnecessary nesting
            simplify_snak($claim->{mainsnak});
            for my $key (keys %{ $claim->{mainsnak} }) {
                $claim->{$key} = $claim->{mainsnak}->{$key};
            }
            delete $claim->{mainsnak};

            next unless $claim->{references};
            for my $reference (@{ $claim->{references} }) {
                delete $reference->{hash};    # internal
                next unless $reference->{snaks};
                for my $snaks (values %{ $reference->{snaks} }) {
                    for my $snak (@$snaks) {
                        simplify_snak($snak);
                    }
                }
            }
        }
    }

    return $data;
}
53
54 1;
55
56 =head1 DESCRIPTION
57
58 This L<Catmandu::Fix> modifies a Wikidata entity record by simplifying its claims.
59
60 =head1 SEE ALSO
61
62 L<Catmandu::Fix::wd_simple> applies both L<Catmandu::Fix::wd_simple_claims> and
63 L<Catmandu::Fix::wd_simple_strings>.
64
65 =encoding utf8
0 package Catmandu::Fix::wd_simple_strings;
1 #ABSTRACT: Simplify labels, descriptions, and aliases of Wikidata entity records
2 #VERSION
3 use Catmandu::Sane;
4 use Moo;
5
# Simplify labels, descriptions, and aliases of a record in place.
#
# Every language-tagged string object ({ language => ..., value => ... }) in
# "labels" and "descriptions" is replaced by its plain value, and every list
# of such objects in "aliases" by a list of plain values.
#
# Values that are already plain (for instance when the fix is applied twice,
# or after the fields were reduced to a single language) are kept as they
# are instead of being dereferenced. Returns the record.
sub fix {
    my ($self, $data) = @_;

    foreach my $what (qw(labels descriptions)) {
        my $hash = $data->{$what};
        next unless ref $hash eq 'HASH';
        foreach my $lang (keys %$hash) {
            my $string = $hash->{$lang};
            $hash->{$lang} = $string->{value} if ref $string eq 'HASH';
        }
    }

    my $aliases = $data->{aliases};
    if (ref $aliases eq 'HASH') {
        foreach my $lang (keys %$aliases) {
            my $list = $aliases->{$lang};
            next unless ref $list eq 'ARRAY';
            $aliases->{$lang} = [
                map { ref $_ eq 'HASH' ? $_->{value} : $_ } @$list
            ];
        }
    }

    return $data;
}
26
27 1;
28
29 =head1 DESCRIPTION
30
31 This L<Catmandu::Fix> modifies a Wikidata entity record by simplifying the
32 labels, aliases, and descriptions. In particular it converts
33
34 "en": { "language": "en", "value": "foo" }
35
36 "en": [ { "language": "en", "value": "foo" },
37 { "language": "en", "value": "bar" } ]
38
39 to
40
41 "en": "foo"
42
43 "en": ["foo","bar"]
44
45 =head1 SEE ALSO
46
47 L<Catmandu::Fix::wd_simple> applies both L<Catmandu::Fix::wd_simple_strings>
48 and L<Catmandu::Fix::wd_simple_claims>.
49
50 =encoding utf8
+0
-56
lib/Catmandu/Fix/wdata_retain_language.pm less more
0 package Catmandu::Fix::wdata_retain_language;
1 #ABSTRACT: Limit string values to a selected language
2 #VERSION
3 use Catmandu::Sane;
4 use Moo;
5
# Language code of the strings to retain; must be given.
has language => (
    is       => 'ro',
    required => 1,
);

# Read-only flag; not consulted by any code in this package so far.
has force => (
    is => 'ro',
);

# Turn the single positional constructor argument (the language) into the
# named form before delegating to the wrapped BUILDARGS.
around BUILDARGS => sub {
    my ($next, $class, $lang) = @_;
    return $next->($class, { language => $lang });
};
14
# Replace the language maps of a record by the strings of the configured
# language: "label" and "description" (undef if not available) and "alias"
# (array, possibly empty). The fields "aliases", "labels", and
# "descriptions" are always removed. Returns the record.
sub fix {
    my ($self, $data) = @_;
    my $lang = $self->language;

    # eval shields against unexpected structures; a failure yields undef
    my %source_of = (description => 'descriptions', label => 'labels');
    for my $target (qw(description label)) {
        my $source = $source_of{$target};
        $data->{$target} = eval { $data->{$source}->{$lang}->{value} };
    }

    # TODO: how to express this as normal fix with move_field?
    my @alias = eval {
        map { $_->{value} } @{ $data->{aliases}->{$lang} }
    };
    $data->{alias} = \@alias;

    # TODO: only delete if string of requested language was found (or force)
    delete @{$data}{qw(aliases labels descriptions)};

    return $data;
}
34
35 1;
36
37 =head1 DESCRIPTION
38
39 This L<Catmandu::Fix> modifies a Wikidata entity record, as imported by
40 L<Catmandu::Importer::Wikidata>, by deleting all language tagged strings (in
41 C<aliases>, C<labels>, and C<descriptions>) except a selected language. The
42 fix
43
44 wdata_retain_language('fr');
45
46 is roughly equivalent to
47
48 move_field('labels.fr.value','label');
49 move_field('descriptions.fr.value','description');
50 move_field('aliases.fr.*.value','alias.$append');
51
52 Modification of additional fields may be added in a future release of this
53 module.
54
55 =encoding utf8
+0
-66
lib/Catmandu/Fix/wdata_simplify_claims.pm less more
0 package Catmandu::Fix::wdata_simplify_claims;
1 #ABSTRACT: Simplify the claims of a Wikidata entity
2 #VERSION
3 use Catmandu::Sane;
4 use Moo;
5
6 # TODO: this only covers some snak types
7 # See https://meta.wikimedia.org/wiki/Wikidata/Data_model#Snaks for more
# Simplify a single snak in place: drop the redundant "property" field and
# lift the fields of "datavalue", if any, into the snak itself.
sub simplify_snak {
    my ($snak) = @_;

    delete $snak->{property};    # redundant

    if (my $datavalue = $snak->{datavalue}) {
        # unnecessary nesting
        for my $key (keys %$datavalue) {
            $snak->{$key} = $snak->{datavalue}->{$key};
        }
        # Disabled for now: mapping values of type 'wikibase-entityid' to
        #   $snak->{entity} = 'P'.$snak->{value}->{'numeric-id'}
        # and removing $snak->{value}.
        delete $snak->{datavalue};
    }

    # TODO add value type (such as 'URL') as soon as it is included in the JSON
    # e.g. P856 in Q52 (Wikipedia)
}
25
# Simplify the "claims" entry of a Wikidata entity record in place.
#
# Each claim loses its internal "id" and its constant "type", gets the
# fields of its simplified main snak merged in, and has the snaks of all
# its references simplified. A record without claims is returned as is.
sub fix {
    my ($self, $data) = @_;

    my $claims = $data->{claims} or return $data;

    # values() starts from a fresh hash iterator; each() would resume one
    # left half-way by earlier code. The property key is not used.
    for my $claim_list (values %$claims) {
        for my $claim (@$claim_list) {
            delete $claim->{id};      # internal
            delete $claim->{type};    # always "statement"

            simplify_snak($claim->{mainsnak});
            for my $field (keys %{ $claim->{mainsnak} }) {    # unnecessary nesting
                $claim->{$field} = $claim->{mainsnak}->{$field};
            }
            delete $claim->{mainsnak};

            next unless $claim->{references};
            for my $reference (@{ $claim->{references} }) {
                delete $reference->{hash};    # internal
                next unless $reference->{snaks};
                for my $snak_list (values %{ $reference->{snaks} }) {
                    for my $snak (@$snak_list) {
                        simplify_snak($snak);
                    }
                }
            }
        }
    }

    return $data;
}
56
57 1;
58
59 =head1 DESCRIPTION
60
61 This L<Catmandu::Fix> modifies a Wikidata entity JSON record by simplifying the
62 C<claims> entry. The simplification is highly experimental and may change in a
63 future release of this module!
64
65 =encoding utf8
6868 }
6969 die "invalid site $site" if $site !~ /^[a-z]+([_-][a-z])*$/;
7070 $site =~ s/-/_/g;
71 # TODO: pass multiple sites|titles
7271 $vars = { sites => $site, titles => $title };
7372 }
7473
9392
9493 sub response_hook {
9594 my ($self, $data) = @_;
96 # TODO: better error handling
97 return [ values %{$data->{entities}} ];
95 return unless ref $data and ref $data->{entities} eq 'HASH';
96 return [
97 map {
98 $_->{missing} = 1 if exists $_->{missing};
99 $_;
100 } grep { ref $_ eq 'HASH'; }
101 values %{$data->{entities}}
102 ];
98103 }
99104
100105 1;
104109 catmandu convert Wikidata --ids Q1,P227
105110 catmandu convert Wikidata --site dewiki --title Wahnsinn
106111
107 echo Q7 | catmandu convert Wikidata
112 echo Q1 | catmandu convert Wikidata
108113 echo Wahnsinn | catmandu convert Wikidata --site dewiki
109114 echo dewiki:Wahnsinn | catmandu convert Wikidata
110115
116 echo Q1 | catmandu convert Wikidata --fix 'retain_field("labels")'
117
111118 =head1 DESCRIPTION
112119
113120 This L<Catmandu::Importer> queries Wikidata for entities, given by their
114121 Wikidata identifier (C<Q...>, C<P...>) or by a title in some known Wikidata
115 site, such as the English Wikipedia (C<enwiki>).
122 site, such as the English Wikipedia (C<enwiki>). The entities are either
123 specified as options (C<ids>, C<site>, and/or C<title>) or as line-separated
124 input values. By default, the raw JSON structure of each Wikidata entity is
125 returned one by one. Entities not found are returned with the C<missing>
126 property set to C<1> like this:
116127
117 See L<Catmandu::Wikidata> for a synopsis.
128 { "id": "Q7", "missing": "1" }
118129
119 By default, the raw JSON structure of each Wikidata entity is returned one by
120 one. Future versions of this module may further expand the entity data to make
121 more easily use of it.
130 To further process the JSON structure L<Catmandu::Wikidata> contains several
131 Catmandu fixes, e.g. to only retain a selected language.
122132
123133 =head1 CONFIGURATION
124134
125135 This importer extends L<Catmandu::Importer::getJSON>, so it can be configured
126 with options C<agent>, C<timeout>, C<headers>, C<proxy>, and C<dry>.
136 with options C<agent>, C<timeout>, C<headers>, C<proxy>, and C<dry>. Additional
137 options include:
127138
128139 =over
129140
1010 catmandu convert Wikidata --title dewiki:Metadaten to JSON --pretty
1111
1212 catmandu convert Wikidata --title "Emma Goldman" \
13 --fix "wdata_retain_language('en')" to JSON --pretty
13 --fix "wd_language('en')" to JSON --pretty
1414
1515 =head1 DESCRIPTION
1616
2525
2626 Import entities from L<http://www.wikidata.org/>.
2727
28 =item L<Catmandu::Fix::wdata_retain_language>
28 =item L<Catmandu::Fix::wd_language>
2929
30 Provides the fix C<wdata_retain_language($language)> to limit the values of
31 field C<aliases>, C<labels>, and C<descriptions> to a selected language.
30 Provides the fix C<wd_language($language)> to limit the values of aliases,
31 labels, and descriptions to a selected language.
3232
33 =item L<Catmandu::Fix::wdata_simplify_claims>
33 =item L<Catmandu::Fix::wd_simple_strings>
3434
35 Simplfies the C<claims> field of a Wikidata entity record.
35 Simplifies labels, descriptions, and aliases of a Wikidata entity record.
36
37 =item L<Catmandu::Fix::wd_simple_claims>
38
39 Simplifies claims of a Wikidata entity record.
40
41 =item L<Catmandu::Fix::wd_simple>
42
43 Applies L<Catmandu::Fix::wd_simple_strings> and
44 L<Catmandu::Fix::wd_simple_claims>.
3645
3746 =back
3847
22 use Test::More;
33
44 use_ok 'Catmandu::Importer::Wikidata';
5 use_ok 'Catmandu::Fix::wdata_retain_language';
6 use_ok 'Catmandu::Fix::wdata_simplify_claims';
5 use_ok 'Catmandu::Fix::wd_language';
6 use_ok 'Catmandu::Fix::wd_simple_strings';
7 use_ok 'Catmandu::Fix::wd_simple_claims';
8 use_ok 'Catmandu::Fix::wd_simple';
79 use_ok 'Catmandu::Wikidata';
810
911 done_testing;