From bb77d2c38b27537f28f9ac0afa2585df7f345e7f Mon Sep 17 00:00:00 2001
From: Michael Brown
Date: Fri, 7 Jun 2013 14:47:07 -0400
Subject: [PATCH] More entropy for foreign titles

* Treat strings with non-ASCII characters as having more entropy
---
 lib/text_sentinel.rb                  |  4 +++-
 spec/components/text_sentinel_spec.rb | 10 +++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/lib/text_sentinel.rb b/lib/text_sentinel.rb
index 230791e62df..cb0d36393d8 100644
--- a/lib/text_sentinel.rb
+++ b/lib/text_sentinel.rb
@@ -21,8 +21,10 @@ class TextSentinel
   end
 
   # Entropy is a number of how many unique characters the string needs.
+  # Non-ASCII characters are weighted heavier since they contain more "information"
   def entropy
-    @entropy ||= @text.to_s.strip.split('').uniq.size
+    chars = @text.to_s.strip.split('')
+    @entropy ||= chars.pack('M*'*chars.size).gsub("\n",'').split('=').uniq.size
   end
 
   def valid?
diff --git a/spec/components/text_sentinel_spec.rb b/spec/components/text_sentinel_spec.rb
index cf9e211efae..669dc837fa6 100644
--- a/spec/components/text_sentinel_spec.rb
+++ b/spec/components/text_sentinel_spec.rb
@@ -32,7 +32,15 @@ describe TextSentinel do
     end
 
     it "Works on foreign characters" do
-      TextSentinel.new("去年十社會警告").entropy.should == 7
+      TextSentinel.new("去年十社會警告").entropy.should == 19
+    end
+
+    it "generates enough entropy for short foreign strings" do
+      TextSentinel.new("又一个测").entropy.should == 11
+    end
+
+    it "handles repeated foreign characters" do
+      TextSentinel.new("又一个测试话题" * 3).entropy.should == 18
     end
 
   end