22 #include <unordered_map> 
   25 #include "UTF8StringSlice.hpp" 
   // Local alias for UTF8StringSlice::LengthType; used as the type of the
   // length-related setter parameters and members in this class.
   31   typedef UTF8StringSlice::LengthType LengthType;
 
   // Runs the extraction steps for `text`.
   // NOTE(review): only part of the body is visible in this excerpt. The
   // visible calls, in order, are CalculateSuffixEntropy(),
   // CalculatePrefixEntropy() and ExtractWordCandidates(); any steps between
   // or after them must be confirmed against the full header.
   39   void Extract(
const std::string& text) {
 
   43     CalculateSuffixEntropy();
 
   46     CalculatePrefixEntropy();
 
   48     ExtractWordCandidates();
 
   // Overload of SetFullText taking the text as a std::string.
   // NOTE(review): the body is not visible in this excerpt.
   53   void SetFullText(
const std::string& fullText) {
 
   // Overload of SetFullText taking the text as a C string pointer.
   // NOTE(review): the body is not visible in this excerpt.
   57   void SetFullText(
const char* fullText) {
 
   61   void SetFullText(
const UTF8StringSlice& fullText) { utf8FullText = fullText; }
 
   // Stores the given value in wordMinLength.
   63   void SetWordMinLength(
const LengthType _wordMinLength) {
 
   64     wordMinLength = _wordMinLength;
 
   // Stores the given value in wordMaxLength.
   67   void SetWordMaxLength(
const LengthType _wordMaxLength) {
 
   68     wordMaxLength = _wordMaxLength;
 
   // Stores the given value in prefixSetLength.
   71   void SetPrefixSetLength(
const LengthType _prefixSetLength) {
 
   72     prefixSetLength = _prefixSetLength;
 
   // Stores the given value in suffixSetLength.
   75   void SetSuffixSetLength(
const LengthType _suffixSetLength) {
 
   76     suffixSetLength = _suffixSetLength;
 
   // Stores `filter` in preCalculationFilter.
   // NOTE(review): the parameter list is not visible in this excerpt; the
   // filter's type and call signature must be confirmed against the full
   // header.
   80   void SetPreCalculationFilter(
 
   83     preCalculationFilter = filter;
 
   // Stores `filter` in postCalculationFilter.
   // NOTE(review): the parameter list is not visible in this excerpt; the
   // filter's type and call signature must be confirmed against the full
   // header.
   86   void SetPostCalculationFilter(
 
   89     postCalculationFilter = filter;
 
   92   void ReleaseSuffixes() { std::vector<UTF8StringSlice8Bit>().swap(suffixes); }
 
   94   void ReleasePrefixes() { std::vector<UTF8StringSlice8Bit>().swap(prefixes); }
 
   96   const std::vector<UTF8StringSlice8Bit>& Words()
 const { 
return words; }
 
   // Read-only access to the `wordCandidates` vector; returns a const
   // reference, so no copy is made.
   98   const std::vector<UTF8StringSlice8Bit>& WordCandidates()
 const {
 
   99     return wordCandidates;
 
  // NOTE(review): these two fields are indented one level deeper than the
  // class members, so they presumably belong to a nested record type whose
  // declaration is outside this excerpt — confirm against the full header.
  105     double suffixEntropy;
 
  106     double prefixEntropy;
 
  // Declarations of the individual processing steps; their definitions are
  // not part of this excerpt. Extract() visibly calls
  // CalculateSuffixEntropy(), CalculatePrefixEntropy() and
  // ExtractWordCandidates() from this list.
  127   void ExtractSuffixes();
 
  129   void ExtractPrefixes();
 
  131   void ExtractWordCandidates();
 
  133   void CalculateFrequency();
 
  135   void CalculateCohesions();
 
  137   void CalculateSuffixEntropy();
 
  139   void CalculatePrefixEntropy();
 
  // Returns a double.
  // NOTE(review): the parameter list is not visible in this excerpt, so the
  // input this entropy is computed from must be confirmed against the full
  // header.
  161   double CalculateEntropy(
 
  // Configuration values; each is written by the Set* method of the same name.
  165   LengthType wordMinLength;
 
  166   LengthType wordMaxLength;
 
  167   LengthType prefixSetLength;
 
  168   LengthType suffixSetLength;
 
  // Filters assigned by SetPreCalculationFilter() / SetPostCalculationFilter().
  // NOTE(review): the type lines for these two members are missing from this
  // excerpt.
  170       preCalculationFilter;
 
  172       postCalculationFilter;
 
  // Boolean flags named after the processing steps.
  // NOTE(review): presumably each records whether the matching step has
  // already run; where they are set and checked is not visible in this
  // excerpt — confirm against the implementation.
  174   bool prefixesExtracted;
 
  175   bool suffixesExtracted;
 
  176   bool frequenciesCalculated;
 
  177   bool wordCandidatesExtracted;
 
  178   bool cohesionsCalculated;
 
  179   bool prefixEntropiesCalculated;
 
  180   bool suffixEntropiesCalculated;
 
  // NOTE(review): presumably an overall occurrence count and its logarithm;
  // where they are computed is not visible in this excerpt — confirm.
  184   size_t totalOccurrence;
 
  185   double logTotalOccurrence;
 
  // Emptied by ReleasePrefixes().
  186   std::vector<UTF8StringSlice8Bit> prefixes;
 
  // Emptied by ReleaseSuffixes().
  187   std::vector<UTF8StringSlice8Bit> suffixes;
 
  // Exposed read-only through WordCandidates().
  188   std::vector<UTF8StringSlice8Bit> wordCandidates;
 
  // Exposed read-only through Words().
  189   std::vector<UTF8StringSlice8Bit> words;
 
  // Gives the PhraseExtractTest class access to this class's non-public
  // members.
  192   friend class PhraseExtractTest;