-
Notifications
You must be signed in to change notification settings - Fork 0
/
fred.py
392 lines (363 loc) · 17.7 KB
/
fred.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
import os
import sys
if sys.version_info[0] >= 3:
from urllib.request import urlopen
from urllib.parse import quote_plus
from urllib.parse import urlencode
from urllib.error import HTTPError
else:
from urllib2 import urlopen
from urllib2 import HTTPError
from urllib import quote_plus
from urllib import urlencode
import xml.etree.ElementTree as ET
import pandas as pd
class Fred(object):
earliest_realtime_start = '1776-07-04'
latest_realtime_end = '9999-12-31'
nan_char = '.'
max_results_per_request = 1000
def __init__(self,
api_key='',
api_key_file=None):
"""
Initialize the Fred class that provides useful functions to query the Fred dataset. You need to specify a valid
API key in one of 3 ways: pass the string via api_key, or set api_key_file to a file with the api key in the
first line, or set the environment variable 'FRED_API_KEY' to the value of your api key. You can sign up for a
free api key on the Fred website at http://research.stlouisfed.org/fred2/
"""
self.api_key = None
if api_key is not None:
self.api_key = api_key
elif api_key_file is not None:
f = open(api_key_file, 'r')
self.api_key = f.readline().strip()
f.close()
else:
self.api_key = os.environ.get('FRED_API_KEY')
if self.api_key is None:
raise ValueError("You need to set a valid API key. You can set it in 3 ways: pass the string with api_key, "
"or set api_key_file to a file with the api key in the first line, or set the environment "
"variable 'FRED_API_KEY' to the value of your api key. You can sign up for a free api key "
"on the Fred website at http://research.stlouisfed.org/fred2/")
def __fetch_data(self, url):
"""
helper function for fetching data given a request URL
"""
try:
response = urlopen(url)
root = ET.fromstring(response.read())
except HTTPError as exc:
root = ET.fromstring(exc.read())
raise ValueError(root.get('message'))
return root
def _parse(self, date_str, format='%Y-%m-%d'):
"""
helper function for parsing FRED date string into datetime
"""
return pd.to_datetime(date_str, format=format).to_datetime()
def get_series_info(self, series_id):
"""
Get information about a series such as its title, frequency, observation start/end dates, units, notes, etc.
Parameters
----------
series_id : str
Fred series id such as 'CPIAUCSL'
Returns
-------
info : Series
a pandas Series containing information about the Fred series
"""
url = "http://api.stlouisfed.org/fred/series?series_id=%s&api_key=%s" % (series_id, self.api_key)
root = self.__fetch_data(url)
if root is None:
raise ValueError('No info exists for series id: ' + series_id)
info = pd.Series(root.getchildren()[0].attrib)
return info
def get_series(self, series_id, observation_start=None, observation_end=None, **kwargs):
"""
Get data for a Fred series id. This fetches the latest known data, and is equivalent to get_series_latest_release()
Parameters
----------
series_id : str
Fred series id such as 'CPIAUCSL'
observation_start : datetime or datetime-like str such as '7/1/2014', optional
earliest observation date
observation_end : datetime or datetime-like str such as '7/1/2014', optional
latest observation date
kwargs : additional parameters
Any additional parameters supported by FRED. You can see http://api.stlouisfed.org/docs/fred/series_observations.html for the full list
Returns
-------
data : Series
a Series where each index is the observation date and the value is the data for the Fred series
"""
url = "http://api.stlouisfed.org/fred/series/observations?series_id=%s&api_key=%s" % (series_id, self.api_key)
if observation_start is not None:
observation_start = pd.to_datetime(observation_start, errors='raise')
url += '&observation_start=' + observation_start.strftime('%Y-%m-%d')
if observation_end is not None:
observation_end = pd.to_datetime(observation_end, errors='raise')
url += '&observation_end=' + observation_end.strftime('%Y-%m-%d')
if kwargs is not None:
url += '&' + urlencode(kwargs)
root = self.__fetch_data(url)
if root is None:
raise ValueError('No data exists for series id: ' + series_id)
data = {}
for child in root.getchildren():
val = child.get('value')
if val == self.nan_char:
val = float('NaN')
else:
val = float(val)
data[self._parse(child.get('date'))] = val
return pd.Series(data)
def get_series_latest_release(self, series_id):
"""
Get data for a Fred series id. This fetches the latest known data, and is equivalent to get_series()
Parameters
----------
series_id : str
Fred series id such as 'CPIAUCSL'
Returns
-------
info : Series
a Series where each index is the observation date and the value is the data for the Fred series
"""
return self.get_series(series_id)
def get_series_first_release(self, series_id):
"""
Get first-release data for a Fred series id. This ignores any revision to the data series. For instance,
The US GDP for Q1 2014 was first released to be 17149.6, and then later revised to 17101.3, and 17016.0.
This will ignore revisions after the first release.
Parameters
----------
series_id : str
Fred series id such as 'GDP'
Returns
-------
data : Series
a Series where each index is the observation date and the value is the data for the Fred series
"""
df = self.get_series_all_releases(series_id)
first_release = df.groupby('date').head(1)
data = first_release.set_index('date')['value']
return data
def get_series_as_of_date(self, series_id, as_of_date):
"""
Get latest data for a Fred series id as known on a particular date. This includes any revision to the data series
before or on as_of_date, but ignores any revision on dates after as_of_date.
Parameters
----------
series_id : str
Fred series id such as 'GDP'
as_of_date : datetime, or datetime-like str such as '10/25/2014'
Include data revisions on or before this date, and ignore revisions afterwards
Returns
-------
data : Series
a Series where each index is the observation date and the value is the data for the Fred series
"""
as_of_date = pd.to_datetime(as_of_date)
df = self.get_series_all_releases(series_id)
data = df[df['realtime_start'] <= as_of_date]
return data
def get_series_all_releases(self, series_id):
"""
Get all data for a Fred series id including first releases and all revisions. This returns a DataFrame
with three columns: 'date', 'realtime_start', and 'value'. For instance, the US GDP for Q4 2013 was first released
to be 17102.5 on 2014-01-30, and then revised to 17080.7 on 2014-02-28, and then revised to 17089.6 on
2014-03-27. You will therefore get three rows with the same 'date' (observation date) of 2013-10-01 but three
different 'realtime_start' of 2014-01-30, 2014-02-28, and 2014-03-27 with corresponding 'value' of 17102.5, 17080.7
and 17089.6
Parameters
----------
series_id : str
Fred series id such as 'GDP'
Returns
-------
data : DataFrame
a DataFrame with columns 'date', 'realtime_start' and 'value' where 'date' is the observation period and 'realtime_start'
is when the corresponding value (either first release or revision) is reported.
"""
url = "http://api.stlouisfedorg/fred/series/observations?series_id=%s&api_key=%s&realtime_start=%s&realtime_end=%s" % (series_id,
self.api_key,
self.earliest_realtime_start,
self.latest_realtime_end)
root = self.__fetch_data(url)
if root is None:
raise ValueError('No data exists for series id: ' + series_id)
data = {}
i = 0
for child in root.getchildren():
val = child.get('value')
if val == self.nan_char:
val = float('NaN')
else:
val = float(val)
realtime_start = self._parse(child.get('realtime_start'))
# realtime_end = self._parse(child.get('realtime_end'))
date = self._parse(child.get('date'))
data[i] = {'realtime_start': realtime_start,
# 'realtime_end': realtime_end,
'date': date,
'value': val}
i += 1
data = pd.DataFrame(data).T
return data
def get_series_vintage_dates(self, series_id):
"""
Get a list of vintage dates for a series. Vintage dates are the dates in history when a
series' data values were revised or new data values were released.
Parameters
----------
series_id : str
Fred series id such as 'CPIAUCSL'
Returns
-------
dates : list
list of vintage dates
"""
url = "http://api.stlouisfed.org/fred/series/vintagedates?series_id=%s&api_key=%s" % (series_id, self.api_key)
root = self.__fetch_data(url)
if root is None:
raise ValueError('No vintage date exists for series id: ' + series_id)
dates = []
for child in root.getchildren():
dates.append(self._parse(child.text))
return dates
def __do_series_search(self, url):
"""
helper function for making one HTTP request for data, and parsing the returned results into a DataFrame
"""
root = self.__fetch_data(url)
series_ids = []
data = {}
num_results_returned = 0 # number of results returned in this HTTP request
num_results_total = int(root.get('count')) # total number of results, this can be larger than number of results returned
for child in root.getchildren():
num_results_returned += 1
series_id = child.get('id')
series_ids.append(series_id)
data[series_id] = {"id": series_id}
fields = ["realtime_start", "realtime_end", "title", "observation_start", "observation_end",
"frequency", "frequency_short", "units", "units_short", "seasonal_adjustment",
"seasonal_adjustment_short", "last_updated", "popularity", "notes"]
for field in fields:
data[series_id][field] = child.get(field)
if num_results_returned > 0:
data = pd.DataFrame(data, columns=series_ids).T
# parse datetime columns
for field in ["realtime_start", "realtime_end", "observation_start", "observation_end", "last_updated"]:
data[field] = data[field].apply(self._parse, format=None)
# set index name
data.index.name = 'series id'
else:
data = None
return data, num_results_total
def __get_search_results(self, url, limit, order_by, sort_order):
"""
helper function for getting search results up to specified limit on the number of results. The Fred HTTP API
truncates to 1000 results per request, so this may issue multiple HTTP requests to obtain more available data.
"""
order_by_options = ['search_rank', 'series_id', 'title', 'units', 'frequency',
'seasonal_adjustment', 'realtime_start', 'realtime_end', 'last_updated',
'observation_start', 'observation_end', 'popularity']
if order_by is not None:
if order_by in order_by_options:
url = url + '&order_by=' + order_by
else:
raise ValueError('%s is not in the valid list of order_by options: %s' % (order_by, str(order_by_options)))
sort_order_options = ['asc', 'desc']
if sort_order is not None:
if sort_order in sort_order_options:
url = url + '&sort_order=' + sort_order
else:
raise ValueError('%s is not in the valid list of sort_order options: %s' % (sort_order, str(sort_order_options)))
data, num_results_total = self.__do_series_search(url)
if data is None:
return data
if limit == 0:
max_results_needed = num_results_total
else:
max_results_needed = limit
if max_results_needed > self.max_results_per_request:
for i in range(1, max_results_needed // self.max_results_per_request + 1):
offset = i * self.max_results_per_request
next_data, _ = self.__do_series_search(url + '&offset=' + str(offset))
data = data.append(next_data)
return data.head(max_results_needed)
def search(self, text, limit=1000, order_by=None, sort_order=None):
"""
Do a fulltext search for series in the Fred dataset. Returns information about matching series in a DataFrame.
Parameters
----------
text : str
text to do fulltext search on, e.g., 'Real GDP'
limit : int, optional
limit the number of results to this value. If limit is 0, it means fetching all results without limit.
order_by : str, optional
order the results by a criterion. Valid options are 'search_rank', 'series_id', 'title', 'units', 'frequency',
'seasonal_adjustment', 'realtime_start', 'realtime_end', 'last_updated', 'observation_start', 'observation_end',
'popularity'
sort_order : str, optional
sort the results by ascending or descending order. Valid options are 'asc' or 'desc'
Returns
-------
info : DataFrame
a DataFrame containing information about the matching Fred series
"""
url = "http://api.stlouisfed.org/fred/series/search?search_text=%s&api_key=%s" % (quote_plus(text), self.api_key)
info = self.__get_search_results(url, limit, order_by, sort_order)
return info
def search_by_release(self, release_id, limit=0, order_by=None, sort_order=None):
"""
Search for series that belongs to a release id. Returns information about matching series in a DataFrame.
Parameters
----------
release_id : int
release id, e.g., 151
limit : int, optional
limit the number of results to this value. If limit is 0, it means fetching all results without limit.
order_by : str, optional
order the results by a criterion. Valid options are 'search_rank', 'series_id', 'title', 'units', 'frequency',
'seasonal_adjustment', 'realtime_start', 'realtime_end', 'last_updated', 'observation_start', 'observation_end',
'popularity'
sort_order : str, optional
sort the results by ascending or descending order. Valid options are 'asc' or 'desc'
Returns
-------
info : DataFrame
a DataFrame containing information about the matching Fred series
"""
url = "http://api.stlouisfed.org/fred/release/series?release_id=%d&&api_key=%s" % (release_id, self.api_key)
info = self.__get_search_results(url, limit, order_by, sort_order)
if info is None:
raise ValueError('No series exists for release id: ' + str(release_id))
return info
def search_by_category(self, category_id, limit=0, order_by=None, sort_order=None):
"""
Search for series that belongs to a category id. Returns information about matching series in a DataFrame.
Parameters
----------
category_id : int
category id, e.g., 32145
limit : int, optional
limit the number of results to this value. If limit is 0, it means fetching all results without limit.
order_by : str, optional
order the results by a criterion. Valid options are 'search_rank', 'series_id', 'title', 'units', 'frequency',
'seasonal_adjustment', 'realtime_start', 'realtime_end', 'last_updated', 'observation_start', 'observation_end',
'popularity'
sort_order : str, optional
sort the results by ascending or descending order. Valid options are 'asc' or 'desc'
Returns
-------
info : DataFrame
a DataFrame containing information about the matching Fred series
"""
url = "http://api.stlouisfed.org/fred/category/series?category_id=%d&api_key=%s" % (category_id, self.api_key)
info = self.__get_search_results(url, limit, order_by, sort_order)
if info is None:
raise ValueError('No series exists for category id: ' + str(category_id))
return info