Concordia
tokenized_sentence.hpp
1 #ifndef TOKENIZED_SENTENCE_HDR
2 #define TOKENIZED_SENTENCE_HDR
3 
4 #include "concordia/common/config.hpp"
5 #include "concordia/token_annotation.hpp"
6 #include "concordia/word_map.hpp"
7 
8 #include <boost/shared_ptr.hpp>
9 #include <string>
10 #include <vector>
11 #include <list>
12 #include <iostream>
13 #include <boost/foreach.hpp>
14 
15 
16 
27 public:
/*!
 * Constructs a tokenized sentence from the given sentence text.
 * The constructor is explicit, so a std::string is never converted
 * to a TokenizedSentence implicitly.
 * \param sentence sentence text, taken by value
 */
32  explicit TokenizedSentence(std::string sentence);
33 
/*! Virtual destructor; the definition lives outside this header. */
36  virtual ~TokenizedSentence();
37 
42  std::string getSentence() const {
43  return _sentence;
44  }
45 
50  std::string getOriginalSentence() const {
51  return _originalSentence;
52  }
53 
/*!
 * Returns a string form of the tokenized sentence. Does not modify
 * the object (const member). The exact format is defined in the
 * implementation file, not in this header.
 * \return the tokenized sentence as a string
 */
58  std::string getTokenizedSentence() const;
59 
65  std::list<TokenAnnotation> getAnnotations() const {
66  return _tokenAnnotations;
67  }
68 
73  std::vector<INDEX_CHARACTER_TYPE> getCodes() const {
74  return _codes;
75  }
76 
82  std::vector<TokenAnnotation> getTokens() const {
83  return _tokens;
84  }
85 
/*!
 * Generates the hash of this sentence using the supplied word map.
 * NOTE(review): presumably this fills the vector returned by
 * getCodes() - confirm against tokenized_sentence.cpp.
 * \param wordMap shared pointer to the word map, passed by value
 */
93  void generateHash(boost::shared_ptr<WordMap> wordMap);
94 
/*!
 * Generates the tokens of this sentence.
 * NOTE(review): presumably this fills the vector returned by
 * getTokens() - confirm against tokenized_sentence.cpp.
 */
102  void generateTokens();
103 
/*!
 * Converts the sentence to lower case. Non-const, so it modifies
 * this object.
 * NOTE(review): which stored strings are affected is not visible in
 * this header - confirm against tokenized_sentence.cpp.
 */
107  void toLowerCase();
108 
/*!
 * Adds the given annotations to this sentence.
 * NOTE(review): how the new annotations are merged or ordered with
 * the existing ones is not visible in this header - confirm against
 * tokenized_sentence.cpp.
 * \param annotations annotations to add, passed by value
 */
118  void addAnnotations(std::vector<TokenAnnotation> annotations);
119 
120  friend std::ostream & operator << (std::ostream & o,
121  const TokenizedSentence & ts) {
122  int index = 0;
123  BOOST_FOREACH(TokenAnnotation token, ts.getAnnotations()) {
124  o << "[" << token.getStart() << "," << token.getEnd() << "]["
125  << token.getType() << "][" << token.getValue() <<"]";
126  if (index < ts.getAnnotations().size() - 1) {
127  o << " ";
128  }
129  index++;
130  }
131  return o;
132  }
133 
134 
135 private:
// Sentence text; returned (as a copy) by getSentence().
136  std::string _sentence;
137 
// Original sentence text; returned (as a copy) by getOriginalSentence().
138  std::string _originalSentence;
139 
// Annotations of the sentence; returned by getAnnotations() and
// printed by operator<<.
140  std::list<TokenAnnotation> _tokenAnnotations;
141 
// Codes of the sentence; returned (as a copy) by getCodes().
142  std::vector<INDEX_CHARACTER_TYPE> _codes;
143 
// Tokens of the sentence; returned (as a copy) by getTokens().
144  std::vector<TokenAnnotation> _tokens;
145 };
146 
147 #endif
std::string getSentence() const
Definition: tokenized_sentence.hpp:42
virtual ~TokenizedSentence()
Definition: tokenized_sentence.cpp:14
std::string getOriginalSentence() const
Definition: tokenized_sentence.hpp:50
void generateTokens()
Definition: tokenized_sentence.cpp:70
void toLowerCase()
Definition: tokenized_sentence.cpp:56
int getType() const
Definition: token_annotation.hpp:36
std::list< TokenAnnotation > getAnnotations() const
Definition: tokenized_sentence.hpp:65
SUFFIX_MARKER_TYPE getStart() const
Definition: interval.hpp:49
SUFFIX_MARKER_TYPE getEnd() const
Definition: interval.hpp:56
Definition: tokenized_sentence.hpp:26
std::vector< TokenAnnotation > getTokens() const
Definition: tokenized_sentence.hpp:82
TokenizedSentence(std::string sentence)
Definition: tokenized_sentence.cpp:9
void generateHash(boost::shared_ptr< WordMap > wordMap)
Definition: tokenized_sentence.cpp:60
std::string getTokenizedSentence() const
Definition: tokenized_sentence.cpp:79
std::vector< INDEX_CHARACTER_TYPE > getCodes() const
Definition: tokenized_sentence.hpp:73
Definition: token_annotation.hpp:16
std::string getValue() const
Definition: token_annotation.hpp:43
void addAnnotations(std::vector< TokenAnnotation > annotations)
Definition: tokenized_sentence.cpp:17