1 | /*
|
---|
2 | * Copyright 2011 Martin Hebnes Pedersen, martinhpedersen @ "google mail"
|
---|
3 | * All rights reserved. Distributed under the terms of the MIT License.
|
---|
4 | */
|
---|
5 |
|
---|
6 | #include "HTSearchParser.h"
|
---|
7 |
|
---|
8 | #include <string>
|
---|
9 | #include <cstring>
|
---|
10 | #include <iostream>
|
---|
11 | #include <sstream>
|
---|
12 | #include <List.h>
|
---|
13 |
|
---|
14 | using namespace std;
|
---|
15 |
|
---|
//Define some constants for parsing
// Opening tags looked up inside the search result data. The pointers themselves
// are const, so they cannot be reassigned and have internal linkage (they will
// not clash with same-named symbols in other translation units).
namespace TwitterSearchTags {
	const char* const ENTRY_TAG = "<entry>";				// one search result
	const char* const WHEN_TAG = "<published>";				// timestamp
	const char* const CONTENT_TAG = "<title>";				// tweet text
	const char* const NAME_TAG = "<name>";					// "screenname (Full Name)"
	const char* const WHERE_TAG = "<uri>";
	const char* const SOURCE_TAG = "<twitter:source>";		// posting application
	const char* const ID_TAG = "<id>";						// tweet id
	const char* const PROFILEIMAGEURL_TAG = "<link>";
}
|
---|
27 |
|
---|
//Define some constants for using the twitter api
// Both values are scanf-style format strings; they are passed to sscanf() by
// the conversion helpers further down in this file.
namespace TwitterSearchAPI {
	// Timestamp layout, e.g. 2011-04-02T13:37:57Z
	const std::string TIME_FORMAT("%d-%d-%dT%d:%d:%dZ");

	// Entry id layout, e.g. tag:search.twitter.com,2005:50093060641656832
	const std::string ID_FORMAT("tag:search.twitter.com,2005:%llu");
}
|
---|
33 |
|
---|
34 | HTSearchParser::HTSearchParser()
|
---|
35 | : HTTimelineParser()
|
---|
36 | {
|
---|
37 | fTweets = new BList();
|
---|
38 | }
|
---|
39 |
|
---|
40 | HTSearchParser::~HTSearchParser()
|
---|
41 | { }
|
---|
42 |
|
---|
43 | status_t
|
---|
44 | HTSearchParser::Parse(const std::string& data)
|
---|
45 | {
|
---|
46 | status_t status = B_OK;
|
---|
47 |
|
---|
48 | #ifdef DEBUG_ENABLED
|
---|
49 | std::cout << "SearchParser: parsing data:" << std::endl;
|
---|
50 | std::cout << data << std::endl;
|
---|
51 | #endif
|
---|
52 |
|
---|
53 | if(data.length() < 30) {
|
---|
54 | std::cout << "Parse error: data was empty" << std::endl;
|
---|
55 | return B_ERROR;
|
---|
56 | }
|
---|
57 |
|
---|
58 | //Devide data into nodes.
|
---|
59 | BList* nodeList = new BList();
|
---|
60 | string buffer("");
|
---|
61 | size_t pos = 0;
|
---|
62 | while(true) {
|
---|
63 | pos = FindValue(&buffer, TwitterSearchTags::ENTRY_TAG, data, pos, false); //We don't want to bother with decoding html right now
|
---|
64 | if(pos == string::npos)
|
---|
65 | break;
|
---|
66 | nodeList->AddItem(new string(buffer));
|
---|
67 | }
|
---|
68 | if(nodeList->IsEmpty()) {
|
---|
69 | delete nodeList;
|
---|
70 | return B_OK;
|
---|
71 | }
|
---|
72 |
|
---|
73 | //Parse nodes
|
---|
74 | status = _ParseNodes(nodeList, fTweets);
|
---|
75 |
|
---|
76 | //Delete all nodes (data has been copied to HTTweets)
|
---|
77 | while(!nodeList->IsEmpty())
|
---|
78 | delete (string *)nodeList->RemoveItem((int32)0);
|
---|
79 | delete nodeList;
|
---|
80 |
|
---|
81 | return status;
|
---|
82 | }
|
---|
83 |
|
---|
84 | status_t
|
---|
85 | HTSearchParser::_ParseNodes(BList* nodeList, BList* resultList)
|
---|
86 | {
|
---|
87 | status_t status = B_OK;
|
---|
88 |
|
---|
89 | string* parsingNode;
|
---|
90 | HTTweet* currentTweet;
|
---|
91 | string buffer("");
|
---|
92 |
|
---|
93 | for(int32 i = 0; i < nodeList->CountItems(); i++) {
|
---|
94 | status = B_OK;
|
---|
95 | buffer = string("");
|
---|
96 | parsingNode = (string *)nodeList->ItemAt(i);
|
---|
97 | if(parsingNode == NULL)
|
---|
98 | return B_BAD_INDEX;
|
---|
99 | currentTweet = new HTTweet();
|
---|
100 |
|
---|
101 | //Content
|
---|
102 | if(FindValue(&buffer, TwitterSearchTags::CONTENT_TAG, *parsingNode, 0) == string::npos)
|
---|
103 | status = B_ERROR;
|
---|
104 | else
|
---|
105 | currentTweet->setText(buffer.c_str());
|
---|
106 |
|
---|
107 | //Screen name & real name
|
---|
108 | if(status == B_OK && FindValue(&buffer, TwitterSearchTags::NAME_TAG, *parsingNode, 0) == string::npos)
|
---|
109 | status = B_ERROR;
|
---|
110 | else {
|
---|
111 | //Parse (Format: "martinhpedersen (Martin H. Pedersen)"
|
---|
112 | size_t screenNameEndIndex = buffer.find(" (");
|
---|
113 | size_t fullNameEndIndex = buffer.find(")");
|
---|
114 |
|
---|
115 | if(fullNameEndIndex != std::string::npos) {
|
---|
116 | currentTweet->setFullName(buffer.substr(screenNameEndIndex+2, fullNameEndIndex-2-screenNameEndIndex).c_str());
|
---|
117 | currentTweet->setScreenName(buffer.substr(0, screenNameEndIndex).c_str());
|
---|
118 | }
|
---|
119 | else {
|
---|
120 | status = B_ERROR;
|
---|
121 | std::cout << "Error parsing for full/screen name" << std::endl;
|
---|
122 | }
|
---|
123 | }
|
---|
124 |
|
---|
125 | //Profile image url
|
---|
126 | if(status == B_OK && _FindProfileImage(&buffer, *parsingNode) == string::npos)
|
---|
127 | status = B_ERROR;
|
---|
128 | else
|
---|
129 | currentTweet->setProfileImageUrl(buffer.c_str());
|
---|
130 |
|
---|
131 | //When
|
---|
132 | if(status == B_OK && FindValue(&buffer, TwitterSearchTags::WHEN_TAG, *parsingNode, 0) == string::npos)
|
---|
133 | status = B_ERROR;
|
---|
134 | else
|
---|
135 | currentTweet->setDate( _StrToTime(buffer.c_str()) );
|
---|
136 |
|
---|
137 | //Source
|
---|
138 | if(status == B_OK && FindValue(&buffer, TwitterSearchTags::SOURCE_TAG, *parsingNode, 0) == string::npos)
|
---|
139 | status = B_ERROR;
|
---|
140 | else {
|
---|
141 | // Parse the data for Application name
|
---|
142 | size_t pos = buffer.find(">", 0); //<a href="http://www.tweetdeck.com/" rel="nofollow">TweetDeck</a>
|
---|
143 | if(pos != std::string::npos) {
|
---|
144 | size_t start = pos;
|
---|
145 | size_t end = pos;
|
---|
146 | while(end < buffer.length() && buffer[end] != '<') {
|
---|
147 | end++;
|
---|
148 | }
|
---|
149 | string sourceName = buffer.substr(start+1, end-start-1);
|
---|
150 | currentTweet->setSourceName(sourceName.c_str());
|
---|
151 | }
|
---|
152 | else
|
---|
153 | currentTweet->setSourceName(buffer.c_str());
|
---|
154 | }
|
---|
155 |
|
---|
156 | //ExternalId
|
---|
157 | if(status == B_OK && FindValue(&buffer, TwitterSearchTags::ID_TAG, *parsingNode, 0) == string::npos)
|
---|
158 | status = B_ERROR;
|
---|
159 | else
|
---|
160 | currentTweet->setId( _StrToId(buffer.c_str()) );
|
---|
161 |
|
---|
162 | if(status == B_OK)
|
---|
163 | resultList->AddItem(currentTweet);
|
---|
164 | else
|
---|
165 | delete currentTweet;
|
---|
166 | }
|
---|
167 |
|
---|
168 | return status;
|
---|
169 | }
|
---|
170 |
|
---|
171 | size_t
|
---|
172 | HTSearchParser::_FindProfileImage(std::string* buffer, const std::string& data)
|
---|
173 | {
|
---|
174 | const char* tag = "<link type=\"image/png\" href=\"";
|
---|
175 | std::string endTag("\" rel=\"image\"/>");
|
---|
176 |
|
---|
177 | size_t start = data.find(tag, 0);
|
---|
178 | size_t end;
|
---|
179 | if(start == std::string::npos)
|
---|
180 | return std::string::npos;
|
---|
181 |
|
---|
182 | start += strlen(tag);
|
---|
183 | end = data.find(endTag, start);
|
---|
184 |
|
---|
185 | if(end != std::string::npos)
|
---|
186 | *buffer = data.substr(start, end-start).c_str();
|
---|
187 |
|
---|
188 | return end;
|
---|
189 | }
|
---|
190 |
|
---|
191 | //Convert from Twitter format to time_t
|
---|
192 | time_t
|
---|
193 | HTSearchParser::_StrToTime(const char* str)
|
---|
194 | {
|
---|
195 | struct tm when;
|
---|
196 | int32 yyyy=0, mm=0, dd=0, hour=0, min=0, sec=0;
|
---|
197 |
|
---|
198 | sscanf(str, TwitterSearchAPI::TIME_FORMAT.c_str(), &yyyy, &mm, &dd, &hour, &min, &sec);
|
---|
199 |
|
---|
200 | when.tm_year = yyyy-1900; //tm_year is year since 1900
|
---|
201 | when.tm_mon = mm-1; //tm_mon range: 0-11
|
---|
202 | when.tm_mday = dd; //tm_mday range: 1-31
|
---|
203 | when.tm_hour = hour; //tm_hour range: 0-23
|
---|
204 | when.tm_min = min; //tm_min range: 0-59
|
---|
205 | when.tm_sec = sec; //tm_sec range: 0-59
|
---|
206 |
|
---|
207 | return mktime(&when);
|
---|
208 | }
|
---|
209 |
|
---|
210 | //Convert from Twitter id string to uint64
|
---|
211 | uint64
|
---|
212 | HTSearchParser::_StrToId(const char* str)
|
---|
213 | {
|
---|
214 | uint64 id = 0;
|
---|
215 |
|
---|
216 | sscanf(str, TwitterSearchAPI::ID_FORMAT.c_str(), &id);
|
---|
217 |
|
---|
218 | return id;
|
---|
219 | }
|
---|