Fine-tuning ingestion

This commit is contained in:
Jake Pullen
2024-07-28 10:45:59 +01:00
parent fe99ee4469
commit 31b82dc1d0
4 changed files with 68 additions and 27 deletions
+22
View File
@@ -0,0 +1,22 @@
# Budget Management System
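This project is a Budget Management System that fetches budget data from the YNAB API and caches it locally. Each entity type (accounts, categories, months, payees, transactions and scheduled transactions) is stored in its own directory under `data/`, and API rate limits are handled to ensure smooth operation.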
## Project Structure
## Setup
1. **Set up environment variables:**
Create a `.env` file in the root directory and add your API token and budget ID:
```
API_TOKEN=your_api_token
BUDGET_ID=your_budget_id
```
## Usage
## Contributing
## License
Binary file not shown.
+35 -26
View File
@@ -1,8 +1,28 @@
import os
import time
import json
import logging
import requests
class injest: class injest:
def __init__(self, injest_info):
    """Store the ingest settings, load cached server knowledge and fetch.

    Args:
        injest_info: Mapping that must provide the keys ``'API_TOKEN'``,
            ``'BUDGET_ID'``, ``'base_url'`` and ``'knowledge_file'``.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    # No trailing commas on these assignments: `x = value,` binds a
    # one-element tuple, which would corrupt the request URL, the headers
    # passed to requests.get and the loop over the entity names.
    self.API_TOKEN = injest_info['API_TOKEN']
    self.BUDGET_ID = injest_info['BUDGET_ID']
    # fetch_and_cache_entity_data builds its URL from the lowercase name,
    # so expose the budget id under both spellings.
    self.budget_id = self.BUDGET_ID
    self.headers = {'Authorization': f'Bearer {self.API_TOKEN}'}
    self.entities = [
        'accounts',
        'categories',
        'months',
        'payees',
        'transactions',
        'scheduled_transactions',
    ]
    self.base_url = injest_info['base_url']
    self.knowledge_file = injest_info['knowledge_file']
    self.knowledge_cache = self.load_knowledge_cache()
    # Constructing the object immediately fetches every entity.
    self.fetch_and_cache_entity_data()
def load_knowledge_cache(self):
    """Return the server-knowledge mapping stored in ``self.knowledge_file``.

    Returns:
        The decoded JSON content of the knowledge file, or an empty dict
        when the file does not exist yet.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    # Open first and handle the miss, instead of checking os.path.exists
    # beforehand: the file can disappear between the check and the open,
    # and this matches how update_server_knowledge_cache reads the file.
    try:
        with open(self.knowledge_file, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return {}
def update_entity_data_cache(entity, data): def update_entity_data_cache(self,entity, data):
current_time = time.strftime('%Y%m%d%H%M%S') current_time = time.strftime('%Y%m%d%H%M%S')
directory = f'data/{entity}' # Directory name is the entity's name directory = f'data/{entity}' # Directory name is the entity's name
if not os.path.exists(directory): if not os.path.exists(directory):
@@ -11,20 +31,19 @@ class injest:
with open(entity_file, 'w') as f: with open(entity_file, 'w') as f:
json.dump(data, f, indent=4) json.dump(data, f, indent=4)
def update_server_knowledge_cache(self, entity, server_knowledge):
    """Record the latest server knowledge for one entity.

    The mapping in ``self.knowledge_file`` is read, the entry for
    ``entity`` is replaced, and the whole mapping is written back.

    Args:
        entity: Name of the entity whose knowledge value changed.
        server_knowledge: The new server knowledge value for that entity.

    Raises:
        json.JSONDecodeError: If the existing file is not valid JSON.
    """
    try:
        with open(self.knowledge_file, 'r') as f:
            knowledge_cache = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been recorded yet.
        knowledge_cache = {}

    knowledge_cache[entity] = server_knowledge

    with open(self.knowledge_file, 'w') as f:
        json.dump(knowledge_cache, f, indent=4)

    # Keep the in-memory copy in step with the file; otherwise a later fetch
    # on the same instance would still see the knowledge loaded at start-up.
    self.knowledge_cache = knowledge_cache
def check_rate_limit(response): def check_rate_limit(self,response):
rate_limit_header = response.headers.get('X-Rate-Limit') rate_limit_header = response.headers.get('X-Rate-Limit')
if rate_limit_header: if rate_limit_header:
requests_made, limit = map(int, rate_limit_header.split('/')) requests_made, limit = map(int, rate_limit_header.split('/'))
@@ -36,28 +55,18 @@ class injest:
else: else:
logging.warning("X-Rate-Limit header is missing.") logging.warning("X-Rate-Limit header is missing.")
def fetch_and_cache_entity_data(budget_id): def fetch_and_cache_entity_data(self):
entities = ['accounts', 'categories', 'months', 'payees', 'transactions', 'scheduled_transactions'] for entity in self.entities:
base_url = 'https://api.ynab.com/v1/budgets' logging.debug(f'entity type is {type(entity)}')
knowledge_file = 'server_knowledge_cache.json' last_knowledge = self.knowledge_cache.get(entity, 0)
# Load existing server knowledge cache
try:
with open(knowledge_file, 'r') as f:
knowledge_cache = json.load(f)
except FileNotFoundError:
knowledge_cache = {}
for entity in entities:
last_knowledge = knowledge_cache.get(entity, 0)
logging.debug(f'Last Knowledge of {entity.capitalize()}: {last_knowledge}') logging.debug(f'Last Knowledge of {entity.capitalize()}: {last_knowledge}')
url = f'{base_url}/{budget_id}/{entity}' url = f'{self.base_url}/{self.budget_id}/{entity}'
if last_knowledge: if last_knowledge:
logging.info(f'Fetching {entity} data since last knowledge: {last_knowledge}') logging.info(f'Fetching {entity} data since last knowledge: {last_knowledge}')
url = url + f'?last_knowledge_of_server={last_knowledge}' url = url + f'?last_knowledge_of_server={last_knowledge}'
response = requests.get(url, headers=headers) response = requests.get(url, headers=self.headers)
check_rate_limit(response) # Check and handle rate limit self.check_rate_limit(response) # Check and handle rate limit
if response.status_code == 429: # HTTP 429 Too Many Requests if response.status_code == 429: # HTTP 429 Too Many Requests
logging.error("Rate limit exceeded. Pausing until the limit is reset.") logging.error("Rate limit exceeded. Pausing until the limit is reset.")
@@ -72,11 +81,11 @@ class injest:
# Check if there is new server knowledge # Check if there is new server knowledge
if server_knowledge is not None and server_knowledge != last_knowledge: if server_knowledge is not None and server_knowledge != last_knowledge:
# Update server knowledge cache # Update server knowledge cache
update_server_knowledge_cache(entity, server_knowledge) self.update_server_knowledge_cache(entity, server_knowledge)
# Update entity data cache without server knowledge # Update entity data cache without server knowledge
entity_data = data['data'] entity_data = data['data']
entity_data.pop('server_knowledge', None) # Remove server knowledge if exists entity_data.pop('server_knowledge', None) # Remove server knowledge if exists
update_entity_data_cache(entity, entity_data) self.update_entity_data_cache(entity, entity_data)
else: else:
logging.info(f"No new data for {entity}. Skipping cache update.") logging.info(f"No new data for {entity}. Skipping cache update.")
+10
View File
@@ -13,3 +13,13 @@ BUDGET_ID = os.getenv('BUDGET_ID')
headers = {'Authorization': f'Bearer {API_TOKEN}'} headers = {'Authorization': f'Bearer {API_TOKEN}'}
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
# Settings handed to the injest constructor. The list of entities is not
# passed in here; the injest class defines it itself.
injest_info = {
    'base_url': 'https://api.ynab.com/v1/budgets',
    'knowledge_file': 'server_knowledge_cache.json',
    'API_TOKEN': API_TOKEN,
    'BUDGET_ID': BUDGET_ID,
}

# Creating the object is enough: injest.__init__ itself calls
# fetch_and_cache_entity_data().
injest(injest_info)