Ticket #11591: HTSearchParser.cpp

File HTSearchParser.cpp, 5.7 KB (added by waddlesplash, 9 years ago)

File that causes the issue.

Line 
1/*
2 * Copyright 2011 Martin Hebnes Pedersen, martinhpedersen @ "google mail"
3 * All rights reserved. Distributed under the terms of the MIT License.
4 */
5
6#include "HTSearchParser.h"
7
8#include <string>
9#include <cstring>
10#include <iostream>
11#include <sstream>
12#include <List.h>
13
14using namespace std;
15
// Opening tags that the parser looks up in the search feed.
// Both the characters and the pointers are const, so the constants cannot be
// re-pointed at run time and keep internal linkage (no clash with same-named
// symbols in other translation units).
// WHERE_TAG and PROFILEIMAGEURL_TAG are not referenced by the code visible in
// this file.
namespace TwitterSearchTags {
	const char* const ENTRY_TAG = "<entry>";
	const char* const WHEN_TAG = "<published>";
	const char* const CONTENT_TAG = "<title>";
	const char* const NAME_TAG = "<name>";
	const char* const WHERE_TAG = "<uri>";
	const char* const SOURCE_TAG = "<twitter:source>";
	const char* const ID_TAG = "<id>";
	const char* const PROFILEIMAGEURL_TAG = "<link>";
}
27
// scanf-style patterns used to decode values taken from the Twitter search
// feed.
namespace TwitterSearchAPI {
	// Timestamp, e.g. 2011-04-02T13:37:57Z
	const std::string TIME_FORMAT("%d-%d-%dT%d:%d:%dZ");

	// Entry id, e.g. tag:search.twitter.com,2005:50093060641656832
	const std::string ID_FORMAT("tag:search.twitter.com,2005:%llu");
}
33
34HTSearchParser::HTSearchParser()
35 : HTTimelineParser()
36{
37 fTweets = new BList();
38}
39
40HTSearchParser::~HTSearchParser()
41{ }
42
43status_t
44HTSearchParser::Parse(const std::string& data)
45{
46 status_t status = B_OK;
47
48 #ifdef DEBUG_ENABLED
49 std::cout << "SearchParser: parsing data:" << std::endl;
50 std::cout << data << std::endl;
51 #endif
52
53 if(data.length() < 30) {
54 std::cout << "Parse error: data was empty" << std::endl;
55 return B_ERROR;
56 }
57
58 //Devide data into nodes.
59 BList* nodeList = new BList();
60 string buffer("");
61 size_t pos = 0;
62 while(true) {
63 pos = FindValue(&buffer, TwitterSearchTags::ENTRY_TAG, data, pos, false); //We don't want to bother with decoding html right now
64 if(pos == string::npos)
65 break;
66 nodeList->AddItem(new string(buffer));
67 }
68 if(nodeList->IsEmpty()) {
69 delete nodeList;
70 return B_OK;
71 }
72
73 //Parse nodes
74 status = _ParseNodes(nodeList, fTweets);
75
76 //Delete all nodes (data has been copied to HTTweets)
77 while(!nodeList->IsEmpty())
78 delete (string *)nodeList->RemoveItem((int32)0);
79 delete nodeList;
80
81 return status;
82}
83
84status_t
85HTSearchParser::_ParseNodes(BList* nodeList, BList* resultList)
86{
87 status_t status = B_OK;
88
89 string* parsingNode;
90 HTTweet* currentTweet;
91 string buffer("");
92
93 for(int32 i = 0; i < nodeList->CountItems(); i++) {
94 status = B_OK;
95 buffer = string("");
96 parsingNode = (string *)nodeList->ItemAt(i);
97 if(parsingNode == NULL)
98 return B_BAD_INDEX;
99 currentTweet = new HTTweet();
100
101 //Content
102 if(FindValue(&buffer, TwitterSearchTags::CONTENT_TAG, *parsingNode, 0) == string::npos)
103 status = B_ERROR;
104 else
105 currentTweet->setText(buffer.c_str());
106
107 //Screen name & real name
108 if(status == B_OK && FindValue(&buffer, TwitterSearchTags::NAME_TAG, *parsingNode, 0) == string::npos)
109 status = B_ERROR;
110 else {
111 //Parse (Format: "martinhpedersen (Martin H. Pedersen)"
112 size_t screenNameEndIndex = buffer.find(" (");
113 size_t fullNameEndIndex = buffer.find(")");
114
115 if(fullNameEndIndex != std::string::npos) {
116 currentTweet->setFullName(buffer.substr(screenNameEndIndex+2, fullNameEndIndex-2-screenNameEndIndex).c_str());
117 currentTweet->setScreenName(buffer.substr(0, screenNameEndIndex).c_str());
118 }
119 else {
120 status = B_ERROR;
121 std::cout << "Error parsing for full/screen name" << std::endl;
122 }
123 }
124
125 //Profile image url
126 if(status == B_OK && _FindProfileImage(&buffer, *parsingNode) == string::npos)
127 status = B_ERROR;
128 else
129 currentTweet->setProfileImageUrl(buffer.c_str());
130
131 //When
132 if(status == B_OK && FindValue(&buffer, TwitterSearchTags::WHEN_TAG, *parsingNode, 0) == string::npos)
133 status = B_ERROR;
134 else
135 currentTweet->setDate( _StrToTime(buffer.c_str()) );
136
137 //Source
138 if(status == B_OK && FindValue(&buffer, TwitterSearchTags::SOURCE_TAG, *parsingNode, 0) == string::npos)
139 status = B_ERROR;
140 else {
141 // Parse the data for Application name
142 size_t pos = buffer.find(">", 0); //<a href="http://www.tweetdeck.com/" rel="nofollow">TweetDeck</a>
143 if(pos != std::string::npos) {
144 size_t start = pos;
145 size_t end = pos;
146 while(end < buffer.length() && buffer[end] != '<') {
147 end++;
148 }
149 string sourceName = buffer.substr(start+1, end-start-1);
150 currentTweet->setSourceName(sourceName.c_str());
151 }
152 else
153 currentTweet->setSourceName(buffer.c_str());
154 }
155
156 //ExternalId
157 if(status == B_OK && FindValue(&buffer, TwitterSearchTags::ID_TAG, *parsingNode, 0) == string::npos)
158 status = B_ERROR;
159 else
160 currentTweet->setId( _StrToId(buffer.c_str()) );
161
162 if(status == B_OK)
163 resultList->AddItem(currentTweet);
164 else
165 delete currentTweet;
166 }
167
168 return status;
169}
170
171size_t
172HTSearchParser::_FindProfileImage(std::string* buffer, const std::string& data)
173{
174 const char* tag = "<link type=\"image/png\" href=\"";
175 std::string endTag("\" rel=\"image\"/>");
176
177 size_t start = data.find(tag, 0);
178 size_t end;
179 if(start == std::string::npos)
180 return std::string::npos;
181
182 start += strlen(tag);
183 end = data.find(endTag, start);
184
185 if(end != std::string::npos)
186 *buffer = data.substr(start, end-start).c_str();
187
188 return end;
189}
190
191//Convert from Twitter format to time_t
192time_t
193HTSearchParser::_StrToTime(const char* str)
194{
195 struct tm when;
196 int32 yyyy=0, mm=0, dd=0, hour=0, min=0, sec=0;
197
198 sscanf(str, TwitterSearchAPI::TIME_FORMAT.c_str(), &yyyy, &mm, &dd, &hour, &min, &sec);
199
200 when.tm_year = yyyy-1900; //tm_year is year since 1900
201 when.tm_mon = mm-1; //tm_mon range: 0-11
202 when.tm_mday = dd; //tm_mday range: 1-31
203 when.tm_hour = hour; //tm_hour range: 0-23
204 when.tm_min = min; //tm_min range: 0-59
205 when.tm_sec = sec; //tm_sec range: 0-59
206
207 return mktime(&when);
208}
209
210//Convert from Twitter id string to uint64
211uint64
212HTSearchParser::_StrToId(const char* str)
213{
214 uint64 id = 0;
215
216 sscanf(str, TwitterSearchAPI::ID_FORMAT.c_str(), &id);
217
218 return id;
219}