Natools

Check-in [f9d16b99f9]
Login
Overview
Comment:smaz-tools: new primitive to filter substrings without enough counts
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: f9d16b99f9520b974efa185b4181d99bd78aa2fd
User & Date: nat on 2016-10-20 20:15:38
Other Links: manifest | tags
Context
2016-10-21
19:32
tools/smaz: new command-line option to filter substrings by count check-in: 637ebd90fa user: nat tags: trunk
2016-10-20
20:15
smaz-tools: new primitive to filter substrings without enough counts check-in: f9d16b99f9 user: nat tags: trunk
2016-10-19
20:24
tools/smaz: use the new trie-based search in dictionary evaluation check-in: 105a5395c6 user: nat tags: trunk
Changes

Modified src/natools-smaz-tools.adb from [5167279c60] to [bad9cd1aa0].

673
674
675
676
677
678
679






















680
681
682
683
684
685
686
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







            end if;

            Index := Index + Verbatim_Length + 1;
         end if;
      end loop;
   end Evaluate_Dictionary_Partial;


   procedure Filter_By_Count
     (Counter : in out Word_Counter;
      Threshold_Count : in String_Count)
   is
      --  Current is the traversal cursor, Doomed designates the entry being
      --  examined (and possibly removed) on each iteration.
      Current : Word_Maps.Cursor := Word_Maps.First (Counter.Map);
      Doomed : Word_Maps.Cursor;
   begin
      --  Single pass over Counter.Map.  Current is always advanced before
      --  the examined entry may be deleted, so the traversal never steps
      --  from a cursor whose element has just been removed.

      loop
         exit when not Word_Maps.Has_Element (Current);

         Doomed := Current;
         Word_Maps.Next (Current);

         if Word_Maps.Element (Doomed) < Threshold_Count then
            Word_Maps.Delete (Counter.Map, Doomed);
         end if;
      end loop;

      --  Every surviving entry must have reached the threshold.
      pragma Assert
        (for all Remaining of Counter.Map => Remaining >= Threshold_Count);
   end Filter_By_Count;


   function Simple_Dictionary
     (Counter : in Word_Counter;
      Word_Count : in Natural)
     return String_Lists.List
   is
      use type Ada.Containers.Count_Type;

Modified src/natools-smaz-tools.ads from [5b71d7ec1d] to [d2634b59f6].

111
112
113
114
115
116
117





118
119
120
121
122
123
124
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129







+
+
+
+
+







      Phrase : in String;
      Min_Size : in Positive;
      Max_Size : in Positive);
      --  Add the "words" from Phrase into Counter, with a word being currently
      --  defined as anything between ASCII blanks or punctuation,
      --  or in other words [0-9A-Za-z\x80-\xFF]+

   procedure Filter_By_Count
     (Counter : in out Word_Counter;
      Threshold_Count : in String_Count);
      --  Remove from Counter every entry whose count is strictly below
      --  Threshold_Count; entries whose count is equal to Threshold_Count
      --  or above are left untouched.

   function Simple_Dictionary
     (Counter : in Word_Counter;
      Word_Count : in Natural)
     return String_Lists.List;
      --  Return, as a string list, the Word_Count words in Counter that have
      --  the highest score, the score of a word being its count multiplied
      --  by its length.