Import upstream version 0.2.0+git20220203.1.b1f7f52
Debian Janitor
1 year, 3 months ago
0 | PATH | |
1 | remote: . | |
2 | specs: | |
3 | numerizer (0.2.0) | |
4 | ||
5 | GEM | |
6 | remote: https://rubygems.org/ | |
7 | specs: | |
8 | minitest (5.15.0) | |
9 | rake (13.0.6) | |
10 | ||
11 | PLATFORMS | |
12 | ruby | |
13 | ||
14 | DEPENDENCIES | |
15 | minitest (~> 5.0) | |
16 | numerizer! | |
17 | rake (~> 13) | |
18 | ||
19 | BUNDLED WITH | |
20 | 2.3.6 |
0 | 0 | = Numerizer |
1 | {<img src="https://github.com/jduff/numerizer/actions/workflows/test.yaml/badge.svg?branch=master" alt="Build Status" />}[https://github.com/jduff/numerizer/actions/workflows/test.yaml?query=branch%3Amaster] | |
1 | 2 | |
2 | Numerizer is a gem to help with parsing numbers in natural language from strings (ex forty two). It was extracted from the awesome Chronic gem http://github.com/evaryont/chronic. | |
3 | Numerizer is a gem to help with parsing numbers in natural language from strings (ex forty two). It was extracted from the awesome Chronic gem https://github.com/mojombo/chronic. | |
3 | 4 | |
4 | 5 | == Installation |
5 | 6 | |
6 | $ sudo gem sources -a http://gemcutter.org | |
7 | $ sudo gem install numerizer | |
7 | $ gem install numerizer | |
8 | 8 | |
9 | 9 | == Usage |
10 | 10 | |
20 | 20 | => "2.375" |
21 | 21 | |
22 | 22 | == Contributors |
23 | Tom Preston-Werner, John Duff⏎ | |
23 | Tom Preston-Werner, John Duff |
0 | 0 | require 'rubygems' |
1 | 1 | require 'rake' |
2 | 2 | |
3 | begin | |
4 | require 'jeweler' | |
5 | Jeweler::Tasks.new do |gem| | |
6 | gem.name = "numerizer" | |
7 | gem.summary = "Numerizer is a gem to help with parsing numbers in natural language from strings (ex forty two)." | |
8 | gem.description = "Numerizer is a gem to help with parsing numbers in natural language from strings (ex forty two). It was extracted from the awesome Chronic gem http://github.com/evaryont/chronic." | |
9 | gem.email = "duff.john@gmail.com" | |
10 | gem.homepage = "http://github.com/jduff/numerizer" | |
11 | gem.license = 'MIT' | |
12 | gem.authors = ["John Duff"] | |
13 | # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings | |
14 | end | |
15 | Jeweler::GemcutterTasks.new | |
16 | rescue LoadError | |
17 | puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler" | |
3 | $:.unshift File.expand_path('../lib', __FILE__) | |
4 | require 'numerizer/version' | |
5 | ||
6 | def version | |
7 | Numerizer::VERSION | |
18 | 8 | end |
19 | 9 | |
20 | 10 | require 'rake/testtask' |
36 | 26 | end |
37 | 27 | end |
38 | 28 | |
39 | # task :test => :check_dependencies | |
# Release task: runs :build first, then tags and publishes the gem.
# `version` is the helper defined earlier in this Rakefile.
desc "Release Numerizer version #{version}"
task :release => :build do
  # Refuse to release unless `git branch` marks master as the current branch.
  unless `git branch` =~ /^\* master$/
    puts "You must be on the master branch to release!"
    exit!
  end
  # --allow-empty lets the release commit be created even with nothing staged.
  sh "git commit --allow-empty -a -m 'Release #{version}'"
  sh "git tag v#{version}"
  sh "git push origin master"
  sh "git push origin v#{version}"
  # Pushes the package that the :build task moved into pkg/.
  sh "gem push pkg/numerizer-#{version}.gem"
end
41 | ||
# Build task: builds the gem from numerizer.gemspec and moves the resulting
# package file into the pkg/ directory (created if missing).
desc 'Build a gem from the gemspec'
task :build do
  FileUtils.mkdir_p 'pkg'
  sh 'gem build numerizer.gemspec'
  FileUtils.mv("./numerizer-#{version}.gem", "pkg")
end
48 | ||
40 | 49 | |
41 | 50 | task :default => :test |
42 | ||
43 | # require 'rake/rdoctask' | |
44 | # Rake::RDocTask.new do |rdoc| | |
45 | # if File.exist?('VERSION') | |
46 | # version = File.read('VERSION') | |
47 | # else | |
48 | # version = "" | |
49 | # end | |
50 | ||
51 | # rdoc.rdoc_dir = 'rdoc' | |
52 | # rdoc.title = "numerizer #{version}" | |
53 | # rdoc.rdoc_files.include('README*') | |
54 | # rdoc.rdoc_files.include('lib/**/*.rb') | |
55 | # end |
9 | 9 | # |
10 | 10 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
11 | 11 | |
12 | require 'strscan' | |
12 | require 'numerizer/version' | |
13 | require 'providers/english_provider' | |
14 | require 'set' | |
13 | 15 | |
# Public entry point of the gem: looks up a language provider and delegates
# the actual word-to-digit conversion to it.
class Numerizer

  # Maps a language code to the provider instance that handles it.
  PROVIDERS = {'en' => EnglishProvider.new}

  # Converts number words in +string+ using the provider registered for +lang+.
  #
  # string - the text to convert; a copy is passed on, the argument itself is
  #          not modified by this method.
  # lang   - key into PROVIDERS (default 'en').
  # ignore - list of words to leave alone; each entry is downcased and the list
  #          is turned into a Set before it reaches the provider.
  # bias   - Symbol forwarded unchanged to the provider (default :none).
  #
  # Returns whatever the provider's #numerize returns.
  # Raises RuntimeError when no provider is registered for +lang+.
  def self.numerize(string, lang: 'en', ignore: [], bias: :none)
    string = string.dup
    ignore = ignore.map(&:downcase).to_set
    provider = PROVIDERS[lang]
    if provider.nil?
      raise "Language #{lang} not found. Language options include #{PROVIDERS.keys}"
    end
    provider.numerize(string, ignore: ignore, bias: bias)
  end

end
# Base class for language providers. #numerize runs a fixed sequence of
# stages over the string; each stage is a hook that raises until a subclass
# overrides it.
class GenericProvider

  # Runs every stage in order on +str+ and returns the result of #postprocess.
  #
  # str    - the String handed to each stage (the same object every time).
  # ignore - collection of words forwarded to every stage (default []).
  # bias   - Symbol forwarded to the numerize_* stages (default :none).
  def numerize(str, ignore: [], bias: :none)
    preprocess(str, ignore)
    numerize_numerals(str, ignore, bias)
    numerize_fractions(str, ignore, bias)
    numerize_ordinals(str, ignore, bias)
    numerize_big_prefixes(str, ignore, bias)
    postprocess(str, ignore)
  end

  private

  # Stage hooks. Each one raises a RuntimeError unless overridden.

  def preprocess(str, ignore)
    raise 'must be implemented in subclass'
  end

  def numerize_numerals(str, ignore, bias)
    raise 'must be implemented in subclass'
  end

  def numerize_fractions(str, ignore, bias)
    raise 'must be implemented in subclass'
  end

  def numerize_ordinals(str, ignore, bias)
    raise 'must be implemented in subclass'
  end

  def numerize_big_prefixes(str, ignore, bias)
    raise 'must be implemented in subclass'
  end

  def postprocess(str, ignore)
    raise 'must be implemented in subclass'
  end

  # Turns list of words into a unionized list, ignoring words specified in
  # arguments or that meet the conditions of the yield block
  #
  # words  - Array of Strings to combine with Regexp.union.
  # ignore - collection answering #include?; matching words are dropped.
  #
  # When a block is given, words for which it returns a truthy value are
  # dropped as well. Returns a Regexp.
  def regexify(words, ignore: [])
    kept = words.reject { |word| ignore.include?(word) || (block_given? && yield(word)) }
    Regexp.union(kept)
  end

end
0 | require 'provider' | |
1 | require 'strscan' | |
2 | ||
# English implementation of the GenericProvider stages. Number words are
# replaced by digits through a series of regex substitutions; converted
# numbers are tagged with a '<num>' prefix while the stages run and the tags
# are stripped again in #postprocess.
class EnglishProvider < GenericProvider

  DIRECT_NUMS = {
    'eleven' => '11',
    'twelve' => '12',
    'thirteen' => '13',
    'fourteen' => '14',
    'fifteen' => '15',
    'sixteen' => '16',
    'seventeen' => '17',
    'eighteen' => '18',
    'nineteen' => '19',
    'ninteen' => '19',
    'zero' => '0',
    'ten' => '10',
  }

  SINGLE_NUMS = {
    'one' => 1,
    'two' => 2,
    'three' => 3,
    'four' => 4,
    'five' => 5,
    'six' => 6,
    'seven' => 7,
    'eight' => 8,
    'nine' => 9
  }

  TEN_PREFIXES = {
    'twenty' => 20,
    'thirty' => 30,
    'forty' => 40,
    'fourty' => 40,
    'fifty' => 50,
    'sixty' => 60,
    'seventy' => 70,
    'eighty' => 80,
    'ninety' => 90
  }

  BIG_PREFIXES = {
    'hundred' => 100,
    'thousand' => 1000,
    'million' => 1_000_000,
    'billion' => 1_000_000_000,
    'trillion' => 1_000_000_000_000,
  }

  FRACTIONS = {
    'half' => 2,
    'halves' => 2,
    'quarter' => 4,
    'quarters' => 4
  }

  ORDINALS = {
    'first' => 1,
    'second' => 2,
  }

  # Words that can be read either as an ordinal or as a fraction denominator.
  SINGLE_ORDINAL_FRACTIONALS = {
    'third' => 3,
    'fourth' => 4,
    'fifth' => 5,
    'sixth' => 6,
    'seventh' => 7,
    'eighth' => 8,
    'ninth' => 9,
  }

  DIRECT_ORDINAL_FRACTIONALS = {
    'tenth' => '10',
    'eleventh' => '11',
    'twelfth' => '12',
    'thirteenth' => '13',
    'fourteenth' => '14',
    'fifteenth' => '15',
    'sixteenth' => '16',
    'seventeenth' => '17',
    'eighteenth' => '18',
    'nineteenth' => '19',
    'twentieth' => '20',
    'thirtieth' => '30',
    'fourtieth' => '40',
    'fiftieth' => '50',
    'sixtieth' => '60',
    'seventieth' => '70',
    'eightieth' => '80',
    'ninetieth' => '90'
  }

  ALL_ORDINALS = ORDINALS.merge(SINGLE_ORDINAL_FRACTIONALS).merge(DIRECT_ORDINAL_FRACTIONALS)
  # FRACTIONS plus an 's'-suffixed key for every ordinal-fractional word.
  ONLY_PLURAL_FRACTIONS = FRACTIONS.merge(
    SINGLE_ORDINAL_FRACTIONALS.merge(DIRECT_ORDINAL_FRACTIONALS).each_with_object({}) do |(word, value), plurals|
      plurals[word + 's'] = value
    end
  )
  ALL_FRACTIONS = ONLY_PLURAL_FRACTIONS.merge(SINGLE_ORDINAL_FRACTIONALS).merge(DIRECT_ORDINAL_FRACTIONALS)

  DIRECT_SINGLE_NUMS = DIRECT_NUMS.merge(SINGLE_NUMS)
  DIRECT_NUMS_TEN_PREFIXES = DIRECT_NUMS.merge(TEN_PREFIXES)
  ORDINAL_SINGLE = ORDINALS.merge(SINGLE_ORDINAL_FRACTIONALS)

  # REGEXP.UNION here breaks insertion into negative Lookbehind
  # (so these two are plain '|'-joined strings rather than Regexp objects).
  ALL_ORDINALS_REGEX = ALL_ORDINALS.keys.join('|')
  PRONOUNS = ['i','you','he','she','we','it','they','to','the'].join('|')

  # Normalises spacing/hyphens and drops a trailing standalone 'a'.
  # Modifies +string+ in place.
  def preprocess(string, ignore)
    string.gsub!(/ +|([^\d])-([^\d])/, '\1 \2') # will mutilate hyphenated-words
    string.gsub!(/\ba$/, '') && string.rstrip! # doesn't make sense for an 'a' at the end to be a 1
  end

  # Replaces direct numbers, single digits, the article 'a' and the tens
  # (optionally combined with a single digit or single ordinal) with
  # '<num>'-tagged digits. Modifies +string+ in place.
  def numerize_numerals(string, ignore, bias)
    single_nums = regexify(SINGLE_NUMS.keys, ignore: ignore)
    dir_single_nums = regexify(DIRECT_SINGLE_NUMS.keys, ignore: ignore)
    ten_prefs = regexify(TEN_PREFIXES.keys, ignore: ignore)
    dir_nums_ten_prefs = regexify(DIRECT_NUMS_TEN_PREFIXES.keys, ignore: ignore)
    single_ords = regexify(ORDINAL_SINGLE.keys, ignore: ignore)

    # easy/direct replacements
    # "<single> <teen-or-tens>" (e.g. 'one fifty') gets ' hundred ' inserted between the two words.
    string.gsub!(/(^|\W)(#{single_nums})\s(#{dir_nums_ten_prefs})(?=$|\W)/i) {$1 << $2 << ' hundred ' << $3}
    string.gsub!(/(^|\W)(#{dir_single_nums})(?=$|\W)/i) { $1 << '<num>' << DIRECT_SINGLE_NUMS[$2].to_s}
    if bias == :ordinal
      # With ordinal bias, 'a' directly before an ordinal word is left as an article.
      string.gsub!(/(^|\W)\ba\b(?=$|\W)(?! (?:#{ALL_ORDINALS_REGEX}))/i, '\1<num>' + 1.to_s)
    else
      string.gsub!(/(^|\W)\ba\b(?=$|\W)/i, '\1<num>' + 1.to_s)
    end

    # ten, twenty, etc.
    string.gsub!(/(^|\W)(#{ten_prefs})(#{single_nums})(?=$|\W)/i) { $1 << '<num>' << (TEN_PREFIXES[$2] + SINGLE_NUMS[$3]).to_s}
    string.gsub!(/(^|\W)(#{ten_prefs})(\s)?(#{single_ords})(?=$|\W)/i) { $1 << '<num>' << (TEN_PREFIXES[$2] + ORDINAL_SINGLE[$4]).to_s << $4[-2, 2]}
    string.gsub!(/(^|\W)(#{ten_prefs})(?=$|\W)/i) { $1 << '<num>' << TEN_PREFIXES[$2].to_s}
  end

  # Replaces fraction words with '/<denominator>' and then tidies the result
  # via #cleanup_fractions. Which words count as fractions depends on +bias+.
  # Modifies +string+ in place.
  def numerize_fractions(string, ignore, bias)
    # handle fractions
    # only plural fractions if ordinal mode
    # Ignore quarter to be handled seperately if not fractional mode
    if bias == :ordinal
      fractionals = regexify(ONLY_PLURAL_FRACTIONS.keys, ignore: ignore + ['quarter', 'quarters'])
    elsif bias == :fractional
      fractionals = regexify(ALL_FRACTIONS.keys, ignore: ignore)
    else
      fractionals = regexify(ALL_FRACTIONS.keys, ignore: ignore + ['quarter', 'quarters'])
    end
    quarters = regexify(['quarter', 'quarters'], ignore: ignore)

    string.gsub!(/a (#{fractionals})(?=$|\W)/i) {'<num>1/' << ALL_FRACTIONS[$1].to_s}
    # TODO : Find Noun Distinction for Quarter
    if bias == :fractional
      string.gsub!(/(^|\W)(#{fractionals})(?=$|\W)/i) {'/' << ALL_FRACTIONS[$2].to_s}
    else
      string.gsub!(/(?<!the|^)(\W)(#{fractionals})(?=$|\W)/i) { '/' << ALL_FRACTIONS[$2].to_s }
      string.gsub!(/(?<!#{PRONOUNS})(^|\W)(#{quarters})(?=$|\W)/i) { '/' << ALL_FRACTIONS[$2].to_s }
    end
    cleanup_fractions(string)
  end


  # Replaces ordinal words with '<num>'-tagged digits followed by the last two
  # letters of the word (e.g. 'first' -> '1st'). Modifies +string+ in place.
  def numerize_ordinals(string, ignore, bias)
    # NOTE(review): every other bias check in this class uses :fractional
    # (singular), so this guard does not trigger for that value and ordinals
    # are still converted under bias: :fractional. Left unchanged on purpose.
    return if bias == :fractionals
    # 'second' is excluded from the general list unless the bias is :ordinal;
    # in that case it is handled by the more restrictive substitution below.
    all_ords = regexify(ALL_ORDINALS.keys, ignore: ignore) {|x| x == 'second' && bias != :ordinal }
    if bias != :ordinal && !ignore.include?('second')
      string.gsub!(/(?<!second|\d|#{ALL_ORDINALS_REGEX})(^|\W)second(?=$|\W)/i) { $1 << '<num>' << ALL_ORDINALS['second'].to_s << 'second'[-2, 2] }
    end
    string.gsub!(/(^|\W)(#{all_ords})(?=$|\W)/i) { $1 << '<num>' << ALL_ORDINALS[$2].to_s << $2[-2, 2]}
  end

  # hundreds, thousands, millions, etc.
  # Multiplies a preceding number by the prefix value (or substitutes the bare
  # value when no digits precede it), running #andition after each prefix.
  def numerize_big_prefixes(string, ignore, bias)
    # big_prefs = regexify(BIG_PREFIXES.keys, ignore: ignore)
    BIG_PREFIXES.each do |k,v|
      next if ignore.include? k.downcase
      string.gsub!(/(?:<num>)?(\d*) *#{k}/i) { $1.empty? ? v : '<num>' << (v * $1.to_i).to_s }
      andition(string)
    end
  end

  # Final pass: sums remaining adjacent numbers, converts leftover 'half'
  # and returns a new String with the '<num>' tags removed.
  def postprocess(string, ignore)
    andition(string)
    numerize_halves(string, ignore)
    #Strip Away Added Num Tags
    string.gsub(/<num>/, '')
  end

  private

  def cleanup_fractions(string)
    # evaluate fractions when preceded by another number
    string.gsub!(/(\d+)(?: | and |-)+(<num>|\s)*(\d+)\s*\/\s*(\d+)/i) { ($1.to_f + ($3.to_f/$4.to_f)).to_s }
    # fix unpreceeded fractions
    string.gsub!(/(?:^|\W)\/(\d+)/, '1/\1')
    string.gsub!(/(?<=[a-zA-Z])\/(\d+)/, ' 1/\1')
  end

  # always substitute halfs
  # (unless 'half' is in the ignore list)
  def numerize_halves(string, ignore)
    return if ignore.include? 'half'
    string.gsub!(/\bhalf\b/i, '1/2')
  end

  # Looks for two '<num>'-tagged integers separated by a space or ' and ' and
  # replaces the pair with their tagged sum when they are joined by 'and' or
  # when the first number has more digits than the second. The scanner is
  # reset after every replacement so the modified string is rescanned from
  # the start.
  def andition(string)
    sc = StringScanner.new(string)
    while(sc.scan_until(/<num>(\d+)( | and )<num>(\d+)(?=[^\w]|$)/i))
      if sc[2] =~ /and/ || sc[1].size > sc[3].size
        string[(sc.pos - sc.matched_size)..(sc.pos-1)] = '<num>' << (sc[1].to_i + sc[3].to_i).to_s
        sc.reset
      end
    end
  end

end
0 | # Generated by jeweler | |
1 | # DO NOT EDIT THIS FILE DIRECTLY | |
2 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec' | |
3 | # -*- encoding: utf-8 -*- | |
4 | # stub: numerizer 0.2.0 ruby lib | |
0 | $:.unshift File.expand_path('../lib', __FILE__) | |
1 | require 'numerizer/version' | |
5 | 2 | |
6 | 3 | Gem::Specification.new do |s| |
7 | 4 | s.name = "numerizer" |
8 | s.version = "0.2.0" | |
5 | s.version = Numerizer::VERSION | |
9 | 6 | |
10 | 7 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= |
11 | 8 | s.require_paths = ["lib"] |
17 | 14 | "LICENSE", |
18 | 15 | "README.rdoc" |
19 | 16 | ] |
20 | s.files = [ | |
21 | ".document", | |
22 | "LICENSE", | |
23 | "README.rdoc", | |
24 | "Rakefile", | |
25 | "VERSION", | |
26 | "lib/numerizer.rb", | |
27 | "numerizer.gemspec", | |
28 | "test/test_helper.rb", | |
29 | "test/test_numerizer.rb" | |
30 | ] | |
17 | s.files = `git ls-files`.split($/) | |
18 | s.test_files = `git ls-files -- test`.split($/) | |
31 | 19 | s.homepage = "http://github.com/jduff/numerizer" |
32 | 20 | s.licenses = ["MIT"] |
33 | 21 | s.rubygems_version = "2.2.2" |
34 | 22 | s.summary = "Numerizer is a gem to help with parsing numbers in natural language from strings (ex forty two)." |
23 | ||
24 | s.add_development_dependency 'rake', '~> 13' | |
25 | s.add_development_dependency 'minitest', '~> 5.0' | |
35 | 26 | end |
36 | 27 |
0 | require 'rubygems' | |
1 | require 'test/unit' | |
2 | ||
3 | 0 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) |
4 | 1 | $LOAD_PATH.unshift(File.dirname(__FILE__)) |
5 | 2 | require 'numerizer' |
6 | 3 | |
7 | class Test::Unit::TestCase | |
4 | require 'minitest/autorun' | |
5 | ||
6 | class TestCase < Minitest::Test | |
8 | 7 | end |
0 | require File.join(File.dirname(__FILE__), 'test_helper') | |
1 | ||
2 | class NumerizerTest < Test::Unit::TestCase | |
3 | def test_straight_parsing | |
4 | strings = { | |
5 | 1 => 'one', | |
6 | 5 => 'five', | |
7 | 10 => 'ten', | |
8 | 11 => 'eleven', | |
9 | 12 => 'twelve', | |
10 | 13 => 'thirteen', | |
11 | 14 => 'fourteen', | |
12 | 15 => 'fifteen', | |
13 | 16 => 'sixteen', | |
14 | 17 => 'seventeen', | |
15 | 18 => 'eighteen', | |
16 | 19 => 'nineteen', | |
17 | 20 => 'twenty', | |
18 | 27 => 'twenty seven', | |
19 | 31 => 'thirty-one', | |
20 | 37 => 'thirty-seven', | |
21 | 41 => 'forty one', | |
22 | 42 => 'fourty two', | |
23 | 59 => 'fifty nine', | |
24 | 100 => 'a hundred', | |
25 | 100 => 'one hundred', | |
26 | 150 => 'one hundred and fifty', | |
27 | # 150 => 'one fifty', | |
28 | 200 => 'two-hundred', | |
29 | 500 => '5 hundred', | |
30 | 999 => 'nine hundred and ninety nine', | |
31 | 1_000 => 'one thousand', | |
32 | 1_200 => 'twelve hundred', | |
33 | 1_200 => 'one thousand two hundred', | |
34 | 17_000 => 'seventeen thousand', | |
35 | 21_473 => 'twentyone-thousand-four-hundred-and-seventy-three', | |
36 | 74_002 => 'seventy four thousand and two', | |
37 | 99_999 => 'ninety nine thousand nine hundred ninety nine', | |
38 | 100_000 => '100 thousand', | |
39 | 250_000 => 'two hundred fifty thousand', | |
40 | 1_000_000 => 'one million', | |
41 | 1_250_007 => 'one million two hundred fifty thousand and seven', | |
42 | 1_000_000_000 => 'one billion', | |
43 | 1_000_000_001 => 'one billion and one' | |
44 | } | |
45 | ||
46 | strings.keys.sort.each do |key| | |
47 | assert_equal key, Numerizer.numerize(strings[key]).to_i | |
48 | end | |
49 | ||
50 | assert_equal "2.5", Numerizer.numerize("two and a half") | |
51 | assert_equal "1/2", Numerizer.numerize("one half") | |
52 | end | |
53 | ||
54 | def test_combined_double_digets | |
55 | assert_equal "21", Numerizer.numerize("twentyone") | |
56 | assert_equal "37", Numerizer.numerize("thirtyseven") | |
57 | end | |
58 | ||
59 | def test_fractions_in_words | |
60 | assert_equal "1/4", Numerizer.numerize("1 quarter") | |
61 | assert_equal "1/4", Numerizer.numerize("one quarter") | |
62 | assert_equal "1/4", Numerizer.numerize("a quarter") | |
63 | assert_equal "1/8", Numerizer.numerize("one eighth") | |
64 | ||
65 | assert_equal "3/4", Numerizer.numerize("three quarters") | |
66 | assert_equal "2/4", Numerizer.numerize("two fourths") | |
67 | assert_equal "3/8", Numerizer.numerize("three eighths") | |
68 | end | |
69 | ||
70 | def test_fractional_addition | |
71 | assert_equal "1.25", Numerizer.numerize("one and a quarter") | |
72 | assert_equal "2.375", Numerizer.numerize("two and three eighths") | |
73 | assert_equal "3.5 hours", Numerizer.numerize("three and a half hours") | |
74 | end | |
75 | ||
76 | def test_word_with_a_number | |
77 | assert_equal "pennyweight", Numerizer.numerize("pennyweight") | |
78 | end | |
79 | ||
80 | def test_edges | |
81 | assert_equal "27 Oct 2006 7:30am", Numerizer.numerize("27 Oct 2006 7:30am") | |
82 | end | |
83 | ||
84 | def test_multiple_slashes_should_not_be_evaluated | |
85 | assert_equal '11/02/2007', Numerizer.numerize('11/02/2007') | |
86 | end | |
87 | ||
88 | def test_compatability | |
89 | assert_equal '1/2', Numerizer.numerize('1/2') | |
90 | assert_equal '05/06', Numerizer.numerize('05/06') | |
91 | assert_equal "3.5 hours", Numerizer.numerize("three and a half hours") | |
92 | end | |
93 | ||
94 | def test_ordinal_strings | |
95 | { | |
96 | 'first' => '1st', | |
97 | 'second' => 'second', | |
98 | 'third' => '3rd', | |
99 | 'fifth' => '5th', | |
100 | 'seventh' => '7th', | |
101 | 'eighth' => '8th', | |
102 | 'tenth' => '10th', | |
103 | 'eleventh' => '11th', | |
104 | 'twelfth' => '12th', | |
105 | 'thirteenth' => '13th', | |
106 | 'sixteenth' => '16th', | |
107 | 'twentieth' => '20th', | |
108 | 'twenty-third' => '23rd', | |
109 | 'thirtieth' => '30th', | |
110 | 'thirty-first' => '31st', | |
111 | 'fourtieth' => '40th', | |
112 | 'fourty ninth' => '49th', | |
113 | 'fiftieth' => '50th', | |
114 | 'sixtieth' => '60th', | |
115 | 'seventieth' => '70th', | |
116 | 'eightieth' => '80th', | |
117 | 'ninetieth' => '90th', | |
118 | 'hundredth' => '100th', | |
119 | 'thousandth' => '1000th', | |
120 | 'millionth' => '1000000th', | |
121 | 'billionth' => '1000000000th', | |
122 | 'trillionth' => '1000000000000th', | |
123 | 'first day month two' => '1st day month 2' | |
124 | }.each do |key, val| | |
125 | assert_equal val, Numerizer.numerize(key) | |
126 | end | |
127 | end | |
128 | ||
129 | end |
0 | require File.join(File.dirname(__FILE__), 'test_helper') | |
1 | ||
# Tests for Numerizer.numerize with the English provider: plain numbers,
# fractions, ordinals, and the lang / ignore / bias keyword arguments.
class NumerizerTestEN < TestCase
  def test_en_argument
    assert_equal '12', Numerizer.numerize('twelve', lang: 'en')
    # An unregistered language key must raise.
    assert_raises RuntimeError do
      Numerizer.numerize('twelve', lang: 'english')
    end
  end

  def test_straight_parsing
    # Expected integer => one phrase, or a list of phrases that must all parse to it.
    strings = {
      1 => 'one',
      5 => 'five',
      10 => 'ten',
      11 => 'eleven',
      12 => 'twelve',
      13 => 'thirteen',
      14 => 'fourteen',
      15 => 'fifteen',
      16 => 'sixteen',
      17 => 'seventeen',
      18 => 'eighteen',
      19 => 'nineteen',
      20 => 'twenty',
      27 => 'twenty seven',
      31 => 'thirty-one',
      37 => 'thirty-seven',
      41 => 'forty one',
      42 => 'fourty two',
      59 => 'fifty nine',
      100 => ['one hundred', 'a hundred', 'hundred a'],
      150 => ['one hundred and fifty', 'one fifty'],
      219 => ['two hundred and nineteen', 'two hundred nineteen', 'two nineteen'],
      200 => 'two-hundred',
      500 => '5 hundred',
      999 => 'nine hundred and ninety nine',
      1_000 => 'one thousand',
      1_200 => ['twelve hundred', 'one thousand two hundred'],
      17_000 => 'seventeen thousand',
      21_473 => 'twentyone-thousand-four-hundred-and-seventy-three',
      74_002 => 'seventy four thousand and two',
      99_999 => 'ninety nine thousand nine hundred ninety nine',
      100_000 => '100 thousand',
      250_000 => 'two hundred fifty thousand',
      1_000_000 => 'one million',
      1_250_007 => 'one million two hundred fifty thousand and seven',
      1_000_000_000 => 'one billion',
      1_000_000_001 => 'one billion and one'
    }

    strings.sort.each do |key, value|
      # Array() lets single phrases and phrase lists share one loop.
      Array(value).each do |phrase|
        assert_equal key, Numerizer.numerize(phrase).to_i
      end
    end

    assert_equal "1/2", Numerizer.numerize("half")
    assert_equal "1/4", Numerizer.numerize("quarter")
  end

  def test_combined_double_digets
    assert_equal "21", Numerizer.numerize("twentyone")
    assert_equal "37", Numerizer.numerize("thirtyseven")
  end

  def test_fractions_in_words
    assert_equal "1/2", Numerizer.numerize("one half")

    assert_equal "1/4", Numerizer.numerize("1 quarter")
    assert_equal "1/4", Numerizer.numerize("one quarter")
    assert_equal "1/4", Numerizer.numerize("a quarter")
    assert_equal "1/8", Numerizer.numerize("one eighth")

    assert_equal "3/4", Numerizer.numerize("three quarters")
    assert_equal "2/4", Numerizer.numerize("two fourths")
    assert_equal "3/8", Numerizer.numerize("three eighths")
    assert_equal "7/10", Numerizer.numerize("seven tenths")
  end

  def test_fractional_addition
    assert_equal "1.25", Numerizer.numerize("one and a quarter")
    assert_equal "2.375", Numerizer.numerize("two and three eighths")
    assert_equal "2.5", Numerizer.numerize("two and a half")
    assert_equal "3.5 hours", Numerizer.numerize("three and a half hours")
  end

  def test_word_with_a_number
    assert_equal "pennyweight", Numerizer.numerize("pennyweight")
  end

  def test_edges
    assert_equal "27 Oct 2006 7:30am", Numerizer.numerize("27 Oct 2006 7:30am")
  end

  def test_multiple_slashes_should_not_be_evaluated
    assert_equal '11/02/2007', Numerizer.numerize('11/02/2007')
  end

  def test_compatability
    assert_equal '1/2', Numerizer.numerize('1/2')
    assert_equal '05/06', Numerizer.numerize('05/06')
    assert_equal "3.5 hours", Numerizer.numerize("three and a half hours")
    assert_equal "1/2 an hour", Numerizer.numerize("half an hour")
  end

  def test_ordinal_strings
    {
      'first' => '1st',
      'second' => '2nd',
      'third' => '3rd',
      'fourth' => '4th',
      'fifth' => '5th',
      'seventh' => '7th',
      'eighth' => '8th',
      'tenth' => '10th',
      'eleventh' => '11th',
      'twelfth' => '12th',
      'thirteenth' => '13th',
      'sixteenth' => '16th',
      'twentieth' => '20th',
      'twenty-third' => '23rd',
      'thirtieth' => '30th',
      'thirty-first' => '31st',
      'fourtieth' => '40th',
      'fourty ninth' => '49th',
      'fiftieth' => '50th',
      'sixtieth' => '60th',
      'seventieth' => '70th',
      'eightieth' => '80th',
      'ninetieth' => '90th',
      'hundredth' => '100th',
      'thousandth' => '1000th',
      'millionth' => '1000000th',
      'billionth' => '1000000000th',
      'trillionth' => '1000000000000th',
      'first day month two' => '1st day month 2'
    }.each do |key, val|
      assert_equal val, Numerizer.numerize(key)
    end
  end

  def test_ambiguous_cases
    # Quarter ( Coin ) is Untested
    # Second ( Time / Verb ) is Untested
    assert_equal 'the 4th', Numerizer.numerize('the fourth')
    assert_equal '1/3 of', Numerizer.numerize('a third of')
    assert_equal '4th', Numerizer.numerize('fourth')
    assert_equal '2nd', Numerizer.numerize('second')
    assert_equal 'I quarter', Numerizer.numerize('I quarter')
    assert_equal 'You quarter', Numerizer.numerize('You quarter')
    assert_equal 'I want to quarter', Numerizer.numerize('I want to quarter')
    assert_equal 'the 1st 1/4', Numerizer.numerize('the first quarter')
    assert_equal '1/4 pound of beef', Numerizer.numerize('quarter pound of beef')
    assert_equal 'the 2nd second', Numerizer.numerize('the second second')
    assert_equal 'the 4th second', Numerizer.numerize('the fourth second')
    assert_equal '1 second', Numerizer.numerize('one second')

    # TODO: Find way to distinguish this verb
    # assert_equal 'I peel and quarter bananas', Numerizer.numerize('I peel and quarter bananas')
  end

  def test_ignore
    assert_equal 'the second day of march', Numerizer.numerize('the second day of march', ignore: ['second'])
    assert_equal 'quarter', Numerizer.numerize('quarter', ignore: ['quarter'])
    assert_equal 'the five guys', Numerizer.numerize('the five guys', ignore: ['five'])
    assert_equal 'the fifty 2', Numerizer.numerize('the fifty two', ignore: ['fifty'])
  end

  def test_bias_ordinal
    assert_equal '4th', Numerizer.numerize('fourth', bias: :ordinal)
    assert_equal '12th', Numerizer.numerize('twelfth', bias: :ordinal)
    assert_equal '2nd', Numerizer.numerize('second', bias: :ordinal)
    assert_equal 'the 4th', Numerizer.numerize('the fourth', bias: :ordinal)
    assert_equal '2.75', Numerizer.numerize('two and three fourths', bias: :ordinal)
    assert_equal '3/5', Numerizer.numerize('three fifths', bias: :ordinal)
    assert_equal 'a 4th of', Numerizer.numerize('a fourth of', bias: :ordinal)
    assert_equal 'I quarter your home', Numerizer.numerize('I quarter your home', bias: :ordinal)
    assert_equal 'the 1st 2nd 3rd', Numerizer.numerize('the first second third', bias: :ordinal)
  end

  def test_bias_fractional
    assert_equal '1/4', Numerizer.numerize('fourth', bias: :fractional)
    assert_equal '1/12', Numerizer.numerize('twelfth', bias: :fractional)
    assert_equal '2nd', Numerizer.numerize('second', bias: :fractional)
    assert_equal 'the 1/4', Numerizer.numerize('the fourth', bias: :fractional)
    assert_equal '2.75', Numerizer.numerize('two and three fourths', bias: :fractional)
    assert_equal '3/5', Numerizer.numerize('three fifths', bias: :fractional)
    assert_equal '1/4 of', Numerizer.numerize('a fourth of', bias: :fractional)
    assert_equal 'I 1/4 your home', Numerizer.numerize('I quarter your home', bias: :fractional)
    assert_equal 'the 1st second 1/3', Numerizer.numerize('the first second third', bias: :fractional)
  end
end