支持缓存

This commit is contained in:
yhydev
2026-01-07 12:17:53 +08:00
parent d361f3a1c1
commit 2ea68d1916

View File

@@ -2,7 +2,10 @@
import argparse import argparse
import datetime import datetime
import os import os
from typing import List, Optional from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import threading
from typing import Dict, List, Optional, Tuple
import requests import requests
from myscripts import working_tool from myscripts import working_tool
import dotenv import dotenv
@@ -91,56 +94,93 @@ def get_pair_data_file_path(pair: str, config_path: str, timeframe: str) -> Opti
print(f"Warning: Failed to get data file path for pair {pair}: {e}") print(f"Warning: Failed to get data file path for pair {pair}: {e}")
return None return None
import hashlib
import json
# Directory holding the per-file date-range cache. The path is relative, so it
# is resolved against the current working directory. Each entry is a JSON file
# named after a hash of the data file's path and modification time.
DATE_CACHE_DIR = Path('.cache') / 'pairs_date'
def _get_cache_path(file_path: Path) -> Path:
    """Return the cache file location for ``file_path`` at its current mtime.

    The cache key combines the path (as given, not resolved) with the file's
    modification time, so a rewritten data file maps to a different entry.
    Only the first 32 hex characters of the SHA-256 digest are used as the
    file name.

    Raises:
        OSError: if ``file_path`` cannot be stat'ed (e.g. it does not exist).
    """
    modified_at = file_path.stat().st_mtime
    digest = hashlib.sha256(f"{file_path}:{modified_at}".encode()).hexdigest()
    return DATE_CACHE_DIR.joinpath(f"{digest[:32]}.json")
def _load_date_cache(file_path: Path) -> Optional[Tuple[pd.Timestamp, pd.Timestamp]]:
    """Return the cached ``(min, max)`` dates for ``file_path``, or ``None``.

    ``None`` means "no usable cache entry": either nothing has been cached for
    the file's current modification time, or the entry could not be read or
    parsed. An unusable entry is reported with a warning instead of being
    dropped silently, and the caller is expected to recompute the range.

    Raises:
        OSError: if ``file_path`` itself cannot be stat'ed while deriving the
            cache location.
    """
    cache_path = _get_cache_path(file_path)
    try:
        # Open directly instead of checking exists() first: this avoids the
        # check-then-open race and one extra filesystem call.
        with open(cache_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return (pd.Timestamp(data['min']), pd.Timestamp(data['max']))
    except FileNotFoundError:
        # Ordinary cache miss.
        return None
    except (OSError, ValueError, KeyError, TypeError, OverflowError) as e:
        # Unreadable file, malformed JSON, missing keys or unparsable dates:
        # treat as a miss, but make the problem visible.
        print(f"Warning: Ignoring unreadable date cache {cache_path}: {e}")
        return None
def _save_date_cache(file_path: Path, date_min: pd.Timestamp, date_max: pd.Timestamp) -> None:
    """Persist the ``(date_min, date_max)`` range of ``file_path`` as JSON.

    Saving is best-effort: any filesystem failure (creating the cache
    directory, stat'ing the data file to derive the cache key, or writing the
    entry) is reported as a warning and otherwise ignored, so a broken cache
    never aborts the caller.

    Args:
        file_path: Data file the range belongs to; its path and mtime form the
            cache key.
        date_min: Earliest date, stored via ``str()``.
        date_max: Latest date, stored via ``str()``.
    """
    try:
        # Directory creation and key derivation live inside the try block so
        # that e.g. a read-only working directory only produces a warning.
        DATE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        cache_path = _get_cache_path(file_path)
        with open(cache_path, 'w', encoding='utf-8') as f:
            json.dump({'min': str(date_min), 'max': str(date_max)}, f)
    except OSError as e:
        print(f"Warning: Failed to save date cache for {file_path}: {e}")
def filter_pairs_by_timerange(pairs: List[str], timerange: str, config_path: str, timeframe: str) -> List[str]: def filter_pairs_by_timerange(pairs: List[str], timerange: str, config_path: str, timeframe: str) -> List[str]:
"""Filter pairs based on timerange, checking if data exists for the given timerange""" """Filter pairs based on timerange, checking if data exists for the given timerange"""
print(f"Checking pair data availability for timerange: {timerange}...") print(f"Checking pair data availability for timerange: {timerange}...")
valid_pairs = []
invalid_pairs = []
try: try:
# Parse timerange string into start and end dates
start_str, end_str = timerange.split('-') start_str, end_str = timerange.split('-')
start_date = datetime.datetime.strptime(start_str, '%Y%m%d') start_date = datetime.datetime.strptime(start_str, '%Y%m%d')
end_date = datetime.datetime.strptime(end_str, '%Y%m%d') end_date = datetime.datetime.strptime(end_str, '%Y%m%d')
# Convert to pandas Timestamps for comparison
start_ts = pd.Timestamp(start_date) start_ts = pd.Timestamp(start_date)
end_ts = pd.Timestamp(end_date) end_ts = pd.Timestamp(end_date)
for pair in pairs:
file_path = get_pair_data_file_path(pair, config_path, timeframe)
if file_path is not None:
try:
# Read only the date column to check data availability
df = pd.read_feather(file_path, columns=['date'])
if not df.empty:
# Convert date column to datetime
df['date'] = pd.to_datetime(df['date'])
# Remove timezone if present
if df['date'].dt.tz is not None:
df['date'] = df['date'].dt.tz_convert('UTC').dt.tz_localize(None)
# Check if there's data within the timerange
mask = (df['date'] >= start_ts) & (df['date'] <= end_ts)
if any(mask):
valid_pairs.append(pair)
else:
invalid_pairs.append(pair)
else:
invalid_pairs.append(pair)
except Exception as e:
print(f"Warning: Error reading data for pair {pair}: {e}")
invalid_pairs.append(pair)
else:
invalid_pairs.append(pair)
except Exception as e: except Exception as e:
print(f"Error parsing timerange: {e}") print(f"Error parsing timerange: {e}")
return [] return []
def get_date_range(file_path: Path) -> Tuple[pd.Timestamp, pd.Timestamp]:
    """Return the ``(earliest, latest)`` value of the file's ``date`` column.

    A cached result is returned when available. Otherwise only the ``date``
    column of the feather file is read, timezone-aware values are converted
    to UTC and made naive, and the resulting range is cached.

    For an empty file, or when reading fails, the inverted range
    ``(pd.Timestamp.max, pd.Timestamp.min)`` is returned and cached too.
    """
    hit = _load_date_cache(file_path)
    if hit is not None:
        return hit

    # Inverted range used as the "no data" marker.
    no_data = (pd.Timestamp.max, pd.Timestamp.min)
    try:
        frame = pd.read_feather(file_path, columns=['date'])
        if frame.empty:
            bounds = no_data
        else:
            dates = pd.to_datetime(frame['date'])
            if dates.dt.tz is not None:
                # Normalise to naive UTC.
                dates = dates.dt.tz_convert('UTC').dt.tz_localize(None)
            bounds = (dates.min(), dates.max())
        _save_date_cache(file_path, *bounds)
        return bounds
    except Exception as e:
        print(f"Warning: Error reading data for {file_path}: {e}")
        _save_date_cache(file_path, *no_data)
        return no_data
def check_pair(pair: str) -> Tuple[str, bool]:
    """Return ``(pair, has_data)`` for the enclosing function's timerange.

    ``has_data`` is False when no data file path is found for the pair, or
    when the file's date range lies entirely before ``start_ts`` or entirely
    after ``end_ts``. ``config_path``, ``timeframe``, ``start_ts`` and
    ``end_ts`` are taken from the enclosing scope.
    """
    data_file = get_pair_data_file_path(pair, config_path, timeframe)
    if data_file is None:
        return pair, False

    earliest, latest = get_date_range(data_file)
    # Kept as "not (before or after)" rather than rewritten with De Morgan so
    # that comparisons involving NaT evaluate exactly as before.
    outside_range = latest < start_ts or earliest > end_ts
    return pair, not outside_range
max_workers = min(32, len(pairs), os.cpu_count() or 4 * 5)
with ThreadPoolExecutor(max_workers=max_workers) as executor:
results = list(executor.map(check_pair, pairs))
valid_pairs = [pair for pair, is_valid in results if is_valid]
invalid_pairs = [pair for pair, is_valid in results if not is_valid]
if invalid_pairs: if invalid_pairs:
print(f"Filtered out {len(invalid_pairs)} pairs with no data in the specified timerange:") print(f"Filtered out {len(invalid_pairs)} pairs with no data in the specified timerange:")
for pair in invalid_pairs[:10]: # Show only first 10 for pair in invalid_pairs[:10]:
print(f" - {pair}") print(f" - {pair}")
if len(invalid_pairs) > 10: if len(invalid_pairs) > 10:
print(f" ... and {len(invalid_pairs) - 10} more pairs") print(f" ... and {len(invalid_pairs) - 10} more pairs")