Concordia
sentence_tokenizer.hpp
1 #ifndef SENTENCE_TOKENIZER_HDR
2 #define SENTENCE_TOKENIZER_HDR
3 
4 #include <string>
5 #include <vector>
6 #include "concordia/common/config.hpp"
7 #include "concordia/tokenized_sentence.hpp"
8 #include "concordia/regex_rule.hpp"
9 #include "concordia/concordia_config.hpp"
10 #include "concordia/concordia_exception.hpp"
11 #include <boost/shared_ptr.hpp>
12 #include <boost/filesystem.hpp>
13 
14 
24 public:
/// Constructs a tokenizer from a shared Concordia configuration object.
///
/// `explicit` prevents implicit conversion from
/// boost::shared_ptr<ConcordiaConfig> to SentenceTokenizer.
///
/// @param config shared pointer to the configuration, taken by value;
///        presumably the source of the named-entity, HTML-tag and
///        stop-word settings used to build the rules held by this class
///        (TODO confirm against sentence_tokenizer.cpp).
/// @throws ConcordiaException the only exception type permitted by the
///         dynamic exception specification below.
///
/// NOTE(review): dynamic exception specifications (`throw(T)`) were
/// deprecated in C++11 and removed in C++17, so this declaration is
/// ill-formed under -std=c++17 or later. Removing it requires changing
/// the out-of-line definition in the same commit.
28  explicit SentenceTokenizer(boost::shared_ptr<ConcordiaConfig> config)
29  throw(ConcordiaException);
30 
/// Destructor. Declared virtual, so deleting a derived object through a
/// SentenceTokenizer pointer is well-defined. Only declared here; the
/// definition is out of line.
33  virtual ~SentenceTokenizer();
34 
/// Tokenizes a sentence and returns the result by value.
///
/// @param sentence     input text, passed by const reference.
/// @param byWhitespace defaults to false; presumably selects plain
///        whitespace splitting instead of the rule-based path
///        (TODO confirm against the definition).
/// @return a TokenizedSentence built from `sentence`.
///
/// NOTE(review): this member is not const-qualified, so it cannot be
/// called on a const SentenceTokenizer.
40  TokenizedSentence tokenize(const std::string & sentence,
41  bool byWhitespace = false);
42 
43 private:
/// Private helper; presumably loads named-entity rules from the given
/// path into _namedEntities (TODO confirm against the definition).
///
/// @param namedEntitiesPath path string, taken by non-const reference.
///
/// NOTE(review): a non-const lvalue reference cannot bind to temporaries
/// and implies the callee may modify the argument. If it is only read,
/// `const std::string &` would be the better signature.
44  void _createNeRules(std::string & namedEntitiesPath);
45 
/// Private helper; presumably builds the HTML-tag rule stored in
/// _htmlTags from the given path (TODO confirm against the definition).
///
/// @param htmlTagsPath path string, taken by non-const reference.
///
/// NOTE(review): a non-const lvalue reference cannot bind to temporaries
/// and implies the callee may modify the argument. If it is only read,
/// `const std::string &` would be the better signature.
46  void _createHtmlTagsRule(std::string & htmlTagsPath);
47 
/// Private helper that returns a shared pointer to a RegexRule.
///
/// Presumably reads several patterns from `filePath` and combines them
/// into a single rule (inferred from the name; TODO confirm).
///
/// @param filePath       path string, passed by value.
/// @param annotationType single-character annotation code; the set of
///        valid values is not visible in this header.
/// @param value          string passed by value; presumably the
///        replacement or label attached to matches (TODO confirm).
/// @param wholeWord      defaults to false; presumably restricts
///        matches to whole words (TODO confirm).
/// @return shared pointer to the resulting RegexRule.
///
/// NOTE(review): both std::string parameters are copied on every call.
/// If the definition only reads them, `const std::string &` would avoid
/// the copies.
48  boost::shared_ptr<RegexRule> _getMultipleRegexRule(
49  std::string filePath,
50  char annotationType,
51  std::string value,
52  bool wholeWord = false);
53 
/// Named-entity rules, stored by value in a vector (unlike the other
/// rules in this class, which are held through boost::shared_ptr).
54  std::vector<RegexRule> _namedEntities;
55 
/// Shared pointer to the rule for HTML tags. Whether it can be null
/// after construction is not visible in this header.
56  boost::shared_ptr<RegexRule> _htmlTags;
57 
/// Flag for stop-word handling; presumably gates use of _stopWords
/// (TODO confirm against the definition). It has no in-class
/// initializer, so it must be set in the constructor before it is read.
58  bool _stopWordsEnabled;
59 
/// Shared pointer to the stop-word rule. Presumably only populated when
/// _stopWordsEnabled is true, so check before dereferencing
/// (TODO confirm against the definition).
60  boost::shared_ptr<RegexRule> _stopWords;
61 };
62 
63 #endif
Definition: concordia_exception.hpp:11
virtual ~SentenceTokenizer()
Definition: sentence_tokenizer.cpp:24
SentenceTokenizer(boost::shared_ptr< ConcordiaConfig > config)
Definition: sentence_tokenizer.cpp:10
Definition: sentence_tokenizer.hpp:23
TokenizedSentence tokenize(const std::string &sentence, bool byWhitespace=false)
Definition: sentence_tokenizer.cpp:27
Definition: tokenized_sentence.hpp:26