Data Handling
This guide covers data loading, validation, transformation, and management in openseries.
Loading Data
From pandas DataFrame/Series
The most common way to load data is from pandas objects:
import pandas as pd
from openseries import OpenTimeSeries
# From pandas Series with DatetimeIndex
data = pd.Series([100, 101, 99, 102],
index=pd.date_range('2023-01-01', periods=4))
series = OpenTimeSeries.from_df(dframe=data)
series.set_new_label(lvl_zero="Sample")
# From pandas DataFrame column
df = pd.DataFrame({
'Date': pd.date_range('2023-01-01', periods=4),
'Close': [100, 101, 99, 102],
'Volume': [1000, 1100, 900, 1200]
})
df.set_index('Date', inplace=True)
series = OpenTimeSeries.from_df(dframe=df['Close'])
series.set_new_label(lvl_zero="Stock")
From Arrays
For custom data or when working with lists:
# From date strings and values
dates = ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04']
values = [100.0, 101.0, 99.0, 102.0]
series = OpenTimeSeries.from_arrays(
dates=dates,
values=values,
name="Custom Data"
)
From Fixed Rate
Generate synthetic data from a fixed rate:
from datetime import date
# Create 252 trading days at 5% annual rate
series = OpenTimeSeries.from_fixed_rate(
rate=0.05,
days=252,
end_date=date(2023, 12, 31),
name="5% Fixed Rate"
)
Data Validation
Date Format Validation
openseries enforces strict date formats:
# Valid date formats
valid_dates = ['2023-01-01', '2023-12-31', '2024-02-29'] # ISO format
# Invalid formats will raise ValidationError
# This will fail with a validation error
invalid_series = OpenTimeSeries.from_arrays(
dates=['01/01/2023', '2023-1-1'], # Wrong format
values=[100, 101]
)
Value Validation
Values must be numeric and finite:
import numpy as np
# Valid values
valid_values = [100.0, 101.5, 99.25, 102.75]
# Handle NaN values appropriately
values_with_nan = [100.0, np.nan, 99.0, 102.0]
series = OpenTimeSeries.from_arrays(
dates=['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
values=values_with_nan,
name="Data with NaN"
)
# Clean NaN values (modifies original)
series.value_nan_handle() # Forward fill
Length Consistency
Dates and values must have the same length:
# This will raise an error
# This will fail with a length mismatch error
invalid_series = OpenTimeSeries.from_arrays(
dates=['2023-01-01', '2023-01-02'],
values=[100.0, 101.0, 102.0] # Different length
)
Data Transformations
Price and Return Conversions
# Assume we have a price series
prices = OpenTimeSeries.from_arrays(
dates=['2023-01-01', '2023-01-02', '2023-01-03'],
values=[100.0, 102.0, 99.0],
name="Stock Price"
)
# Convert to simple returns (modifies original)
prices.value_to_ret()
print(f"Returns: {prices.values}") # [0.02, -0.0294...]
# Convert to log returns (modifies original)
prices.value_to_log()
# Convert returns back to cumulative values (modifies original)
prices.to_cumret()
# Convert to differences (absolute changes) (modifies original)
prices.value_to_diff()
Resampling
Change the frequency of your data:
# Daily to monthly (business month end) (modifies original)
series.resample_to_business_period_ends(freq="BME")
# Daily to quarterly (modifies original)
series.resample_to_business_period_ends(freq="BQE")
# Daily to annual (modifies original)
series.resample_to_business_period_ends(freq="BYE")
# Custom resampling with pandas frequency strings (modifies original)
series.resample(freq="W")
# Resample with specific method (modifies original)
series.resample(freq="W", method="mean")
Business Day Alignment
Align data to business day calendars:
# Align to US business days (modifies original)
series.align_index_to_local_cdays(countries="US")
# Align to multiple countries (intersection) (modifies original)
series.align_index_to_local_cdays(countries=["US", "GB", "JP"])
# Align to specific market calendar (modifies original)
series.align_index_to_local_cdays(markets="NYSE")
Handling Missing Data
NaN Handling Strategies
import numpy as np
# Create series with missing values
dates = ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04']
values = [100.0, np.nan, 102.0, np.nan]
series_with_nan = OpenTimeSeries.from_arrays(
dates=dates, values=values, name="With NaN"
)
# Forward fill missing values (for price series) (modifies original)
series_with_nan.value_nan_handle()
# For return series, replace NaN with 0.0 (modifies original)
series_with_nan.value_to_ret()
series_with_nan.return_nan_handle()
Dropping Missing Data
# Remove NaN values entirely (modifies original)
series_with_nan.value_nan_handle(method="drop")
Working with Multiple Assets
Creating OpenFrame
from openseries import OpenFrame
# Create multiple series
series1 = OpenTimeSeries.from_arrays(
dates=['2023-01-01', '2023-01-02', '2023-01-03'],
values=[100, 102, 99], name="Asset A"
)
series2 = OpenTimeSeries.from_arrays(
dates=['2023-01-01', '2023-01-02', '2023-01-03'],
values=[50, 51, 49], name="Asset B"
)
# Create frame
frame = OpenFrame(constituents=[series1, series2])
Handling Different Date Ranges
OpenFrame automatically handles series with different date ranges:
# Series with different start/end dates
early_series = OpenTimeSeries.from_arrays(
dates=['2022-12-01', '2023-01-01', '2023-01-02'],
values=[95, 100, 102], name="Early Start"
)
late_series = OpenTimeSeries.from_arrays(
dates=['2023-01-02', '2023-01-03', '2023-01-04'],
values=[51, 49, 52], name="Late Start"
)
# Frame will align to common date range
frame = OpenFrame(constituents=[early_series, late_series])
print(f"Frame date range: {frame.first_idx} to {frame.last_idx}")
Adding and Removing Series
# Add a new series
new_series = OpenTimeSeries.from_arrays(
dates=['2023-01-01', '2023-01-02', '2023-01-03'],
values=[200, 205, 198], name="Asset C"
)
frame.add_timeseries(new_series)
# Remove a series by index
frame.delete_timeseries(item_idx=0)
Data Export and Import
Excel Export
# Export single series
series.to_xlsx(filename="single_series.xlsx")
# Export frame (multiple series)
frame.to_xlsx(filename="multiple_series.xlsx")
# Export with custom sheet title
series.to_xlsx(
filename="formatted_export.xlsx",
sheet_title="Analysis"
)
JSON Export
# Export series values only
series.to_json(what_output="values", filename="series_values.json")
# Export full dataframe structure
series.to_json(what_output="tsdf", filename="series_dataframe.json")
# Export frame data
frame.to_json(what_output="values", filename="frame_values.json")
Working with Real Data Sources
Yahoo Finance Integration
import yfinance as yf
# Single asset
ticker = yf.Ticker("AAPL")
data = ticker.history(period="2y")
apple = OpenTimeSeries.from_df(
dframe=data['Close'],
name="Apple Inc."
)
# Multiple assets
tickers = ["AAPL", "GOOGL", "MSFT"]
series_list = []
for ticker_symbol in tickers:
ticker = yf.Ticker(ticker_symbol)
data = ticker.history(period="1y")
series = OpenTimeSeries.from_df(
dframe=data['Close'],
name=ticker_symbol
)
series_list.append(series)
tech_frame = OpenFrame(constituents=series_list)
CSV Data
# Load from CSV
df = pd.read_csv("stock_data.csv", index_col=0, parse_dates=True)
series = OpenTimeSeries.from_df(
dframe=df['Close'],
name="Stock from CSV"
)
Data Quality Checks
Validation Methods
# Check for data quality issues
print(f"Series length: {series.length}")
print(f"Date range: {series.first_idx} to {series.last_idx}")
print(f"Span of days: {series.span_of_days}")
# Check for gaps in data
expected_length = (series.last_idx - series.first_idx).days + 1
actual_length = series.length
if expected_length != actual_length:
print(f"Data gaps detected: expected {expected_length}, got {actual_length}")
Outlier Detection
# Convert to returns for outlier analysis (modifies original)
series.value_to_ret()
# Detect outliers using the built-in method
outliers = series.outliers(threshold=3.0)
print(f"Found {len(outliers)} outliers (|z| > 3)")
# For OpenFrame, outliers returns a DataFrame
frame_outliers = frame.outliers(threshold=3.0)
print(f"Found outliers in frame: {len(frame_outliers)} rows")
# Customize threshold and date range
recent_outliers = series.outliers(
threshold=2.5,
months_from_last=6
)
Performance Considerations
Memory Usage
# For large datasets, consider resampling
large_series = series # Assume this is large daily data
# Reduce to monthly for analysis (modifies original)
large_series.resample_to_business_period_ends(freq="BME")
# Use monthly for computationally intensive operations
monthly_metrics = large_series.all_properties()
Efficient Data Loading
# When loading multiple assets, batch the operations
tickers = ["AAPL", "GOOGL", "MSFT", "AMZN", "TSLA"]
# Download all at once
data = yf.download(tickers, period="2y")['Close']
# Create series efficiently
series_list = []
for ticker in tickers:
series = OpenTimeSeries.from_df(
dframe=data[ticker].dropna(),
name=ticker
)
series_list.append(series)
frame = OpenFrame(constituents=series_list)
This comprehensive guide should help you handle various data scenarios effectively with openseries.