1 #ifndef SENTENCE_TOKENIZER_HDR
2 #define SENTENCE_TOKENIZER_HDR
6 #include "concordia/common/config.hpp"
7 #include "concordia/tokenized_sentence.hpp"
8 #include "concordia/regex_rule.hpp"
9 #include "concordia/concordia_config.hpp"
10 #include "concordia/concordia_exception.hpp"
11 #include <boost/shared_ptr.hpp>
12 #include <boost/filesystem.hpp>
41 bool byWhitespace =
false);
/**
 * Declaration of the helper _createNeRules; returns nothing.
 * \param namedEntitiesPath passed by non-const lvalue reference, so the
 *        caller must supply a named std::string (a temporary or a string
 *        literal will not bind).
 * NOTE(review): presumably reads named-entity rule definitions from the
 * file at this path and stores them in _namedEntities -- confirm against
 * the definition in sentence_tokenizer.cpp.
 */
44 void _createNeRules(std::string & namedEntitiesPath);
/**
 * Declaration of the helper _createHtmlTagsRule; returns nothing.
 * \param htmlTagsPath passed by non-const lvalue reference, so the caller
 *        must supply a named std::string (a temporary or a string literal
 *        will not bind).
 * NOTE(review): presumably builds the rule held in _htmlTags from the file
 * at this path -- confirm against the definition in sentence_tokenizer.cpp.
 */
46 void _createHtmlTagsRule(std::string & htmlTagsPath);
48 boost::shared_ptr<RegexRule> _getMultipleRegexRule(
52 bool wholeWord =
false);
// Collection of RegexRule objects stored by value.
// NOTE(review): presumably the named-entity rules filled in by
// _createNeRules -- confirm.
54 std::vector<RegexRule> _namedEntities;
// Shared-ownership handle to a single RegexRule.
// NOTE(review): presumably the rule produced by _createHtmlTagsRule -- confirm.
56 boost::shared_ptr<RegexRule> _htmlTags;
// Boolean flag.
// NOTE(review): presumably tells the tokenizer whether the _stopWords rule
// should be applied -- confirm where it is set and read.
58 bool _stopWordsEnabled;
// Shared-ownership handle to a single RegexRule.
// NOTE(review): may be empty when _stopWordsEnabled is false -- verify
// before dereferencing.
60 boost::shared_ptr<RegexRule> _stopWords;
Definition: concordia_exception.hpp:11
virtual ~SentenceTokenizer()
Definition: sentence_tokenizer.cpp:24
SentenceTokenizer(boost::shared_ptr< ConcordiaConfig > config)
Definition: sentence_tokenizer.cpp:10
Definition: sentence_tokenizer.hpp:23
TokenizedSentence tokenize(const std::string &sentence, bool byWhitespace=false)
Definition: sentence_tokenizer.cpp:27
Definition: tokenized_sentence.hpp:26