22 #include <unordered_map>
25 #include "UTF8StringSlice.hpp"
31 typedef UTF8StringSlice::LengthType LengthType;
39 void Extract(
const std::string& text) {
43 CalculateSuffixEntropy();
46 CalculatePrefixEntropy();
48 ExtractWordCandidates();
53 void SetFullText(
const std::string& fullText) {
57 void SetFullText(
const char* fullText) {
61 void SetFullText(
const UTF8StringSlice& fullText) { utf8FullText = fullText; }
63 void SetWordMinLength(
const LengthType _wordMinLength) {
64 wordMinLength = _wordMinLength;
67 void SetWordMaxLength(
const LengthType _wordMaxLength) {
68 wordMaxLength = _wordMaxLength;
71 void SetPrefixSetLength(
const LengthType _prefixSetLength) {
72 prefixSetLength = _prefixSetLength;
75 void SetSuffixSetLength(
const LengthType _suffixSetLength) {
76 suffixSetLength = _suffixSetLength;
80 void SetPreCalculationFilter(
83 preCalculationFilter = filter;
86 void SetPostCalculationFilter(
89 postCalculationFilter = filter;
92 void ReleaseSuffixes() { std::vector<UTF8StringSlice8Bit>().swap(suffixes); }
94 void ReleasePrefixes() { std::vector<UTF8StringSlice8Bit>().swap(prefixes); }
96 const std::vector<UTF8StringSlice8Bit>& Words()
const {
return words; }
98 const std::vector<UTF8StringSlice8Bit>& WordCandidates()
const {
99 return wordCandidates;
105 double suffixEntropy;
106 double prefixEntropy;
127 void ExtractSuffixes();
129 void ExtractPrefixes();
131 void ExtractWordCandidates();
133 void CalculateFrequency();
135 void CalculateCohesions();
137 void CalculateSuffixEntropy();
139 void CalculatePrefixEntropy();
161 double CalculateEntropy(
165 LengthType wordMinLength;
166 LengthType wordMaxLength;
167 LengthType prefixSetLength;
168 LengthType suffixSetLength;
170 preCalculationFilter;
172 postCalculationFilter;
174 bool prefixesExtracted;
175 bool suffixesExtracted;
176 bool frequenciesCalculated;
177 bool wordCandidatesExtracted;
178 bool cohesionsCalculated;
179 bool prefixEntropiesCalculated;
180 bool suffixEntropiesCalculated;
184 size_t totalOccurrence;
185 double logTotalOccurrence;
186 std::vector<UTF8StringSlice8Bit> prefixes;
187 std::vector<UTF8StringSlice8Bit> suffixes;
188 std::vector<UTF8StringSlice8Bit> wordCandidates;
189 std::vector<UTF8StringSlice8Bit> words;
192 friend class PhraseExtractTest;
Definition: UTF8StringSlice.hpp:202
Definition: UTF8StringSlice.hpp:54