extended and renamed fixes
Jakob Voss
10 years ago
4 | 4 | - "5.16" |
5 | 5 | - "5.14" |
6 | 6 | - "5.12" |
7 | - "5.10" | |
8 | 7 | |
9 | 8 | before_install: |
10 | 9 | - git config --global user.name "TravisCI" |
0 | 0 | name = Catmandu-Wikidata |
1 | 1 | license = Perl_5 |
2 | version = 0.03 | |
2 | version = 0.04 | |
3 | 3 | copyright_year = 2014 |
4 | 4 | author = Jakob Voß |
5 | 5 | copyright_holder = Jakob Voß |
0 | package Catmandu::Fix::wd_language; | |
1 | #ABSTRACT: Limit string values to a selected language | |
2 | #VERSION | |
3 | use Catmandu::Sane; | |
4 | use Moo; | |
5 | ||
# Language code (e.g. 'en') whose strings are kept by fix(); given as the
# first positional argument of the fix.
has language => (is => 'ro', required => 1);

# Optional flag; declared here but not consulted by fix() in this package yet.
has force => (is => 'ro');

# Translate the positional fix arguments into named constructor arguments.
# The first argument is the language; any following key/value pairs (for
# instance force => 1) are passed through, so that the remaining attributes
# can actually be set. The positional language always wins over a
# 'language' key among the options.
around BUILDARGS => sub {
    my ($orig, $class, $language, %options) = @_;
    return $orig->($class, { %options, language => $language });
};
14 | ||
# Reduce the language tagged strings of a record to the configured language.
#
# - labels / descriptions: a language map is replaced by the string of the
#   selected language (or removed if that language is absent), a plain string
#   is kept; the field is then renamed to label / description.
# - aliases: a language map is replaced by the list of alias strings of the
#   selected language (an empty list if there are none).
#
# Modifies $data in place and returns it.
sub fix {
    my ($self, $data) = @_;
    my $lang = $self->language;

    # Step 1: pick the requested language out of each language map.
    for my $key ('labels', 'descriptions') {
        next if !exists $data->{$key};

        my $map = $data->{$key};
        next if !ref $map;    # keep simple strings as given

        my $entry = $map->{$lang};
        if (!defined $entry) {
            delete $data->{$key};
            next;
        }

        # entries are either { value => ... } hashes or plain strings
        $data->{$key} = ref $entry ? $entry->{value} : $entry;
    }

    # Step 2: rename the surviving plural fields to their singular form.
    my %singular_of = (labels => 'label', descriptions => 'description');
    for my $plural (qw(labels descriptions)) {
        $data->{ $singular_of{$plural} } = delete $data->{$plural}
            if exists $data->{$plural};
    }

    # Step 3: aliases keep their name but become a flat list of strings.
    my $alias_map = $data->{aliases};
    if (ref $alias_map eq 'HASH') {
        my $selected = $alias_map->{$lang};
        $data->{aliases} = defined $selected
            ? [ map { ref $_ ? $_->{value} : $_ } @$selected ]
            : [];
    }

    # TODO: only delete if a string of the requested language was found
    # (or force)

    return $data;
}
55 | ||
56 | 1; | |
57 | ||
58 | =head1 DESCRIPTION | |
59 | ||
60 | This L<Catmandu::Fix> modifies a Wikidata entity record, as imported by | |
61 | L<Catmandu::Importer::Wikidata>, by deleting all language tagged strings (in | |
62 | C<aliases>, C<labels>, and C<descriptions>) except a selected language. The | |
63 | strings are also simplified as done with L<Catmandu::Fix::wd_simple_strings>. | |
64 | ||
65 | =encoding utf8 |
0 | package Catmandu::Fix::wd_simple; | |
1 | #ABSTRACT: Simplify Wikidata entity records | |
2 | #VERSION | |
3 | use Catmandu::Sane; | |
4 | use Moo; | |
5 | ||
6 | use Catmandu::Fix::wd_simple_strings; | |
7 | use Catmandu::Fix::wd_simple_claims; | |
8 | ||
# Apply the string and the claim simplification to a record and strip the
# redundant 'site' member from every sitelink.
#
# Modifies $data in place and returns it.
sub fix {
    my ($self, $data) = @_;

    # Both component fixes are called as plain functions on the same record,
    # strings first, claims second.
    my @steps = (
        \&Catmandu::Fix::wd_simple_strings::fix,
        \&Catmandu::Fix::wd_simple_claims::fix,
    );
    $_->($self, $data) for @steps;

    my $sitelinks = $data->{sitelinks};
    if ($sitelinks) {
        delete $sitelinks->{$_}->{site} for keys %$sitelinks;
    }

    return $data;
}
23 | ||
24 | 1; | |
25 | ||
26 | =head1 DESCRIPTION | |
27 | ||
28 | This L<Catmandu::Fix> simplifies a Wikidata entity record by applying both | |
29 | L<Catmandu::Fix::wd_simple_strings> and L<Catmandu::Fix::wd_simple_claims>. It | |
30 | further simplifies sitelinks by removing redundant fields. | |
31 | ||
32 | =encoding utf8 |
0 | package Catmandu::Fix::wd_simple_claims; | |
1 | #ABSTRACT: Simplify claims of a Wikidata entity record | |
2 | #VERSION | |
3 | use Catmandu::Sane; | |
4 | use Moo; | |
5 | ||
6 | # TODO: also support other snak types | |
7 | # See https://www.mediawiki.org/wiki/Wikibase/DataModel for more | |
# Flatten a single snak hash in place.
#
# The 'property' member is removed as redundant. If the snak carries a
# 'datavalue', its members are lifted one level up into the snak; for snaks
# of datatype 'wikibase-item' the value is reduced to its numeric id.
# Afterwards 'type' and 'datavalue' are removed. Snaks without a datavalue
# only lose their 'property'.
#
# Nothing useful is returned; the snak is modified in place.
sub simplify_snak {
    my ($snak) = @_;

    delete $snak->{property};    # redundant

    my $datavalue = $snak->{datavalue};
    return unless $datavalue;

    # unnecessary nesting: lift the datavalue members into the snak
    $snak->{$_} = $datavalue->{$_} for keys %$datavalue;

    # datatype may be absent, so compare against a defined string, and only
    # dereference the value if it really is a hash
    if (($snak->{datatype} // '') eq 'wikibase-item'
        and ref $snak->{value} eq 'HASH') {
        $snak->{value} = $snak->{value}->{'numeric-id'};
    }

    delete $snak->{type};        # treated as redundant next to datatype
    delete $snak->{datavalue};

    return;
}
22 | ||
# Simplify all claims of a record in place.
#
# For every statement the internal 'id' and the 'type' member are removed,
# the main snak is simplified with simplify_snak() and its members are
# lifted into the statement itself. The snaks of all references are
# simplified as well and the internal reference 'hash' is removed.
#
# Returns $data; records without claims are returned unchanged.
sub fix {
    my ($self, $data) = @_;

    my $claims = $data->{claims} or return $data;

    # Iterate over the values directly: the property ids are not needed, and
    # unlike each() this does not leave a half-advanced hash iterator behind
    # if simplify_snak() dies on malformed input.
    for my $statements (values %$claims) {
        for my $claim (@$statements) {
            delete $claim->{id};      # internal
            delete $claim->{type};    # always "statement"

            # unnecessary nesting: merge the simplified main snak into the claim
            my $mainsnak = $claim->{mainsnak};
            simplify_snak($mainsnak);
            if ($mainsnak) {
                $claim->{$_} = $mainsnak->{$_} for keys %$mainsnak;
            }
            delete $claim->{mainsnak};

            next unless $claim->{references};
            for my $reference (@{ $claim->{references} }) {
                delete $reference->{hash};    # internal
                next unless $reference->{snaks};
                for my $snaks (values %{ $reference->{snaks} }) {
                    simplify_snak($_) for @$snaks;
                }
            }
        }
    }

    return $data;
}
53 | ||
54 | 1; | |
55 | ||
56 | =head1 DESCRIPTION | |
57 | ||
58 | This L<Catmandu::Fix> modifies a Wikidata entity record by simplifying its claims. | |
59 | ||
60 | =head1 SEE ALSO | |
61 | ||
62 | L<Catmandu::Fix::wd_simple> applies both L<Catmandu::Fix::wd_simple_claims> and | |
63 | L<Catmandu::Fix::wd_simple_strings>. | |
64 | ||
65 | =encoding utf8 |
0 | package Catmandu::Fix::wd_simple_strings; | |
1 | #ABSTRACT: Simplify labels, descriptions, and aliases of Wikidata entity records | |
2 | #VERSION | |
3 | use Catmandu::Sane; | |
4 | use Moo; | |
5 | ||
# Simplify labels, descriptions, and aliases of a record in place.
#
# Each language entry of 'labels' and 'descriptions' of the form
# { value => ... } is replaced by the value itself; each language entry of
# 'aliases' is replaced by the list of its values. Entries that are already
# plain strings are kept, so the fix can safely be applied more than once.
#
# Returns $data.
sub fix {
    my ($self, $data) = @_;

    for my $what (qw(labels descriptions)) {
        my $strings = $data->{$what};
        next unless ref $strings eq 'HASH';
        for my $lang (keys %$strings) {
            my $entry = $strings->{$lang};
            $strings->{$lang} = $entry->{value} if ref $entry eq 'HASH';
        }
    }

    my $aliases = $data->{aliases};
    if (ref $aliases eq 'HASH') {
        for my $lang (keys %$aliases) {
            my $list = $aliases->{$lang};
            next unless ref $list eq 'ARRAY';
            $aliases->{$lang}
                = [ map { ref $_ eq 'HASH' ? $_->{value} : $_ } @$list ];
        }
    }

    return $data;
}
26 | ||
27 | 1; | |
28 | ||
29 | =head1 DESCRIPTION | |
30 | ||
31 | This L<Catmandu::Fix> modifies a Wikidata entity record by simplifying the | |
32 | labels, aliases, and descriptions. In particular it converts | |
33 | ||
34 | "en": { "language": "en", "value": "foo" } | |
35 | ||
36 | "en": [ { "language": "en", "value": "foo" }, | |
37 | { "language": "en", "value": "bar" } ] | |
38 | ||
39 | to | |
40 | ||
41 | "en": "foo" | |
42 | ||
43 | "en": ["foo","bar"] | |
44 | ||
45 | =head1 SEE ALSO | |
46 | ||
47 | L<Catmandu::Fix::wd_simple> applies both L<Catmandu::Fix::wd_simple_strings> | |
48 | and L<Catmandu::Fix::wd_simple_claims>. | |
49 | ||
50 | =encoding utf8 |
0 | package Catmandu::Fix::wdata_retain_language; | |
1 | #ABSTRACT: Limit string values to a selected language | |
2 | #VERSION | |
3 | use Catmandu::Sane; | |
4 | use Moo; | |
5 | ||
# Language code (e.g. 'fr') whose strings are retained by fix(); given as the
# first positional argument of the fix.
has language => (is => 'ro', required => 1);

# Optional flag; declared here but not consulted by fix() in this package yet.
has force => (is => 'ro');

# Translate the positional fix arguments into named constructor arguments.
# The first argument is the language; any following key/value pairs (for
# instance force => 1) are passed through, so that the remaining attributes
# can actually be set. The positional language always wins over a
# 'language' key among the options.
around BUILDARGS => sub {
    my ($orig, $class, $language, %options) = @_;
    return $orig->($class, { %options, language => $language });
};
14 | ||
# Replace the language maps of a record by the strings of one language.
#
# 'description' and 'label' are set to the value of the configured language
# (undef if not available), 'alias' is set to the list of alias values of
# that language (empty if not available). The original fields 'aliases',
# 'labels', and 'descriptions' are always removed.
#
# Modifies $data in place and returns it.
sub fix {
    my ($self, $data) = @_;
    my $lang = $self->language;

    # Lookups are wrapped in eval so that unexpected structures yield
    # undef / an empty list instead of an exception.
    my %singular_of = (descriptions => 'description', labels => 'label');
    for my $plural (qw(descriptions labels)) {
        $data->{ $singular_of{$plural} } = eval {
            $data->{$plural}->{$lang}->{value}
        };
    }

    # TODO: how to express this as normal fix with move_field?
    my @alias_values = eval {
        map { $_->{value} } @{$data->{aliases}->{$lang}}
    };
    $data->{alias} = \@alias_values;

    # TODO: only delete if a string of the requested language was found
    # (or force)
    delete @{$data}{qw(aliases labels descriptions)};

    return $data;
}
34 | ||
35 | 1; | |
36 | ||
37 | =head1 DESCRIPTION | |
38 | ||
39 | This L<Catmandu::Fix> modifies a Wikidata entity record, as imported by | |
40 | L<Catmandu::Importer::Wikidata>, by deleting all language tagged strings (in | |
41 | C<aliases>, C<labels>, and C<descriptions>) except a selected language. The | |
42 | fix | |
43 | ||
44 | wdata_retain_language('fr'); | |
45 | ||
46 | is roughly equivalent to | |
47 | ||
48 | move_field('labels.fr.value','label'); | |
49 | move_field('descriptions.fr.value','description'); | |
50 | move_field('aliases.fr.*.value','alias.$append'); | |
51 | ||
52 | Modification of additional fields may be added in a future release of this | |
53 | module. | |
54 | ||
55 | =encoding utf8 |
0 | package Catmandu::Fix::wdata_simplify_claims; | |
1 | #ABSTRACT: Simplify the claims of a Wikidata entity | |
2 | #VERSION | |
3 | use Catmandu::Sane; | |
4 | use Moo; | |
5 | ||
6 | # TODO: this only covers some snak types | |
7 | # See https://meta.wikimedia.org/wiki/Wikidata/Data_model#Snaks for more | |
# Flatten a single snak hash in place: drop the redundant 'property' member
# and, if present, lift the members of 'datavalue' one level up into the
# snak before removing 'datavalue' itself.
#
# A conversion of 'wikibase-entityid' values into an 'entity' member was
# drafted here earlier but is disabled.
sub simplify_snak {
    my ($snak) = @_;

    delete $snak->{property};    # redundant

    my $nested = $snak->{datavalue};
    if ($nested) {
        # unnecessary nesting: copy all datavalue members into the snak
        @{$snak}{ keys %$nested } = values %$nested;
        delete $snak->{datavalue};
    }

    # TODO add value type (such as 'URL') as soon as it is included in the JSON
    # e.g. P856 in Q52 (Wikipedia)
}
25 | ||
# Simplify the claims of a record in place.
#
# For every statement the internal 'id' and the 'type' member are removed,
# the main snak is simplified with simplify_snak() and its members are
# lifted into the statement itself. The snaks of all references are
# simplified as well and the internal reference 'hash' is removed.
#
# Returns $data; records without claims are returned unchanged.
sub fix {
    my ($self, $data) = @_;

    my $claims = $data->{claims} or return $data;

    # Iterate over the values directly: the property ids are not needed, and
    # unlike each() this does not leave a half-advanced hash iterator behind
    # if simplify_snak() dies on malformed input.
    for my $statements (values %$claims) {
        for my $claim (@$statements) {
            delete $claim->{id};      # internal
            delete $claim->{type};    # always "statement"

            # unnecessary nesting: merge the simplified main snak into the claim
            my $mainsnak = $claim->{mainsnak};
            simplify_snak($mainsnak);
            if ($mainsnak) {
                $claim->{$_} = $mainsnak->{$_} for keys %$mainsnak;
            }
            delete $claim->{mainsnak};

            next unless $claim->{references};
            for my $reference (@{ $claim->{references} }) {
                delete $reference->{hash};    # internal
                next unless $reference->{snaks};
                for my $snaks (values %{ $reference->{snaks} }) {
                    simplify_snak($_) for @$snaks;
                }
            }
        }
    }

    return $data;
}
56 | ||
57 | 1; | |
58 | ||
59 | =head1 DESCRIPTION | |
60 | ||
61 | This L<Catmandu::Fix> modifies a Wikidata entity JSON record by simplifying the | |
62 | C<claims> entry. The simplification is highly experimental and may change in a | |
63 | future release of this module! | |
64 | ||
65 | =encoding utf8 |
68 | 68 | } |
69 | 69 | die "invalid site $site" if $site !~ /^[a-z]+([_-][a-z])*$/; |
70 | 70 | $site =~ s/-/_/g; |
71 | # TODO: pass multiple sites|titles | |
72 | 71 | $vars = { sites => $site, titles => $title }; |
73 | 72 | } |
74 | 73 | |
93 | 92 | |
# Turn a decoded API response into the list of entity records.
#
# Returns nothing unless $data is a hash with a hash of 'entities'.
# Otherwise returns an array reference with all entities that are hashes;
# for entities carrying a 'missing' member that member is normalized to 1.
sub response_hook {
    my ($self, $data) = @_;

    # also reject non-hash references, which cannot be indexed by key
    return unless ref $data eq 'HASH' and ref $data->{entities} eq 'HASH';

    my @entities = grep { ref $_ eq 'HASH' } values %{ $data->{entities} };
    for my $entity (@entities) {
        $entity->{missing} = 1 if exists $entity->{missing};
    }

    return \@entities;
}
99 | 104 | |
100 | 105 | 1; |
104 | 109 | catmandu convert Wikidata --ids Q1,P227 |
105 | 110 | catmandu convert Wikidata --site dewiki --title Wahnsinn |
106 | 111 | |
107 | echo Q7 | catmandu convert Wikidata | |
112 | echo Q1 | catmandu convert Wikidata | |
108 | 113 | echo Wahnsinn | catmandu convert Wikidata --site dewiki |
109 | 114 | echo dewiki:Wahnsinn | catmandu convert Wikidata |
110 | 115 | |
116 | echo Q1 | catmandu convert Wikidata --fix 'retain_field("labels")' | |
117 | ||
111 | 118 | =head1 DESCRIPTION |
112 | 119 | |
113 | 120 | This L<Catmandu::Importer> queries Wikidata for entities, given by their |
114 | 121 | Wikidata identifier (C<Q...>, C<P...>) or by a title in some known Wikidata |
115 | site, such as the English Wikipedia (C<enwiki>). | |
122 | site, such as the English Wikipedia (C<enwiki>). The entities are either | |
123 | specified as options (C<ids>, C<site>, and/or C<title>) or as line-separated | |
124 | input values. By default, the raw JSON structure of each Wikidata entity is | |
125 | returned one by one. Entities not found are returned with the C<missing> | |
126 | property set to C<1> like this: | |
116 | 127 | |
117 | See L<Catmandu::Wikidata> for a synopsis. | |
128 | { "id": "Q7", "missing": "1" } | |
118 | 129 | |
119 | By default, the raw JSON structure of each Wikidata entity is returned one by | |
120 | one. Future versions of this module may further expand the entity data to make | |
121 | more easily use of it. | |
130 | To further process the JSON structure L<Catmandu::Wikidata> contains several | |
131 | Catmandu fixes, e.g. to only retain a selected language. | |
122 | 132 | |
123 | 133 | =head1 CONFIGURATION |
124 | 134 | |
125 | 135 | This importer extends L<Catmandu::Importer::getJSON>, so it can be configured |
126 | with options C<agent>, C<timeout>, C<headers>, C<proxy>, and C<dry>. | |
136 | with options C<agent>, C<timeout>, C<headers>, C<proxy>, and C<dry>. Additional | |
137 | options include: | |
127 | 138 | |
128 | 139 | =over |
129 | 140 |
10 | 10 | catmandu convert Wikidata --title dewiki:Metadaten to JSON --pretty |
11 | 11 | |
12 | 12 | catmandu convert Wikidata --title "Emma Goldman" \ |
13 | --fix "wdata_retain_language('en')" to JSON --pretty | |
13 | --fix "wd_language('en')" to JSON --pretty | |
14 | 14 | |
15 | 15 | =head1 DESCRIPTION |
16 | 16 | |
25 | 25 | |
26 | 26 | Import entities from L<http://www.wikidata.org/>. |
27 | 27 | |
28 | =item L<Catmandu::Fix::wdata_retain_language> | |
28 | =item L<Catmandu::Fix::wd_language> | |
29 | 29 | |
30 | Provides the fix C<wdata_retain_language($language)> to limit the values of | |
31 | field C<aliases>, C<labels>, and C<descriptions> to a selected language. | |
30 | Provides the fix C<wd_language($language)> to limit the values of aliases, | |
31 | labels, and descriptions to a selected language. | |
32 | 32 | |
33 | =item L<Catmandu::Fix::wdata_simplify_claims> | |
33 | =item L<Catmandu::Fix::wd_simple_strings> | |
34 | 34 | |
35 | Simplfies the C<claims> field of a Wikidata entity record. | |
35 | Simplifies labels, descriptions, and aliases of a Wikidata entity record. | |
36 | ||
37 | =item L<Catmandu::Fix::wd_simple_claims> | |
38 | ||
39 | Simplifies claims of a Wikidata entity record. | |
40 | ||
41 | =item L<Catmandu::Fix::wd_simple> | |
42 | ||
43 | Applies L<Catmandu::Fix::wd_simple_strings> and | |
44 | L<Catmandu::Fix::wd_simple_claims>. | |
36 | 45 | |
37 | 46 | =back |
38 | 47 |
2 | 2 | use Test::More; |
3 | 3 | |
4 | 4 | use_ok 'Catmandu::Importer::Wikidata'; |
5 | use_ok 'Catmandu::Fix::wdata_retain_language'; | |
6 | use_ok 'Catmandu::Fix::wdata_simplify_claims'; | |
5 | use_ok 'Catmandu::Fix::wd_language'; | |
6 | use_ok 'Catmandu::Fix::wd_simple_strings'; | |
7 | use_ok 'Catmandu::Fix::wd_simple_claims'; | |
8 | use_ok 'Catmandu::Fix::wd_simple'; | |
7 | 9 | use_ok 'Catmandu::Wikidata'; |
8 | 10 | |
9 | 11 | done_testing; |