支持缓存 (support caching)
This commit is contained in:
@@ -2,7 +2,10 @@
|
||||
import argparse
|
||||
import datetime
|
||||
import os
|
||||
from typing import List, Optional
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
import threading
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
import requests
|
||||
from myscripts import working_tool
|
||||
import dotenv
|
||||
@@ -91,60 +94,97 @@ def get_pair_data_file_path(pair: str, config_path: str, timeframe: str) -> Opti
|
||||
print(f"Warning: Failed to get data file path for pair {pair}: {e}")
|
||||
return None
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
|
||||
DATE_CACHE_DIR = Path('.cache') / 'pairs_date'
|
||||
|
||||
def _get_cache_path(file_path: Path) -> Path:
|
||||
mtime = file_path.stat().st_mtime
|
||||
file_key = f"{file_path}:{mtime}"
|
||||
hash_key = hashlib.sha256(file_key.encode()).hexdigest()[:32]
|
||||
return DATE_CACHE_DIR / f"{hash_key}.json"
|
||||
|
||||
def _load_date_cache(file_path: Path) -> Optional[Tuple[pd.Timestamp, pd.Timestamp]]:
    """Return the cached (min, max) date range for *file_path*, or None.

    A missing, unreadable, or corrupt cache entry is treated as a cache
    miss so the caller simply recomputes the range from the data file.
    """
    cache_path = _get_cache_path(file_path)
    if not cache_path.exists():
        return None
    try:
        payload = json.loads(cache_path.read_text())
        return (pd.Timestamp(payload['min']), pd.Timestamp(payload['max']))
    except Exception:
        # Corrupt or half-written entry: fall through to a miss.
        return None
|
||||
|
||||
def _save_date_cache(file_path: Path, date_min: pd.Timestamp, date_max: pd.Timestamp) -> None:
    """Persist the (min, max) date range for *file_path* (best-effort).

    Failures are reported but never raised — the cache is an optimization,
    not a requirement.
    """
    DATE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    cache_path = _get_cache_path(file_path)
    payload = {'min': str(date_min), 'max': str(date_max)}
    try:
        with open(cache_path, 'w') as f:
            json.dump(payload, f)
    except Exception as e:
        print(f"Warning: Failed to save date cache for {file_path}: {e}")
|
||||
|
||||
def filter_pairs_by_timerange(pairs: List[str], timerange: str, config_path: str, timeframe: str) -> List[str]:
    """Filter pairs based on timerange, checking if data exists for the given timerange.

    Args:
        pairs: Candidate pair names (e.g. "BTC/USDT").
        timerange: Range string of the form "YYYYMMDD-YYYYMMDD".
        config_path: Config file path forwarded to get_pair_data_file_path.
        timeframe: Timeframe string forwarded to get_pair_data_file_path.

    Returns:
        The subset of *pairs* whose data files contain at least one row
        inside the timerange. Returns [] when the timerange cannot be
        parsed or *pairs* is empty.
    """
    print(f"Checking pair data availability for timerange: {timerange}...")

    try:
        # Parse "YYYYMMDD-YYYYMMDD" into naive pandas Timestamps.
        start_str, end_str = timerange.split('-')
        start_date = datetime.datetime.strptime(start_str, '%Y%m%d')
        end_date = datetime.datetime.strptime(end_str, '%Y%m%d')
        start_ts = pd.Timestamp(start_date)
        end_ts = pd.Timestamp(end_date)
    except Exception as e:
        print(f"Error parsing timerange: {e}")
        return []

    if not pairs:
        # ThreadPoolExecutor rejects max_workers=0; nothing to check anyway.
        return []

    def get_date_range(file_path: Path) -> Tuple[pd.Timestamp, pd.Timestamp]:
        # Cached lookup of a data file's (min, max) date range.
        cached = _load_date_cache(file_path)
        if cached is not None:
            return cached

        try:
            # Read only the date column — cheap even for large files.
            df = pd.read_feather(file_path, columns=['date'])
            if df.empty:
                # Sentinel range (max, min) that overlaps no real timerange.
                result = (pd.Timestamp.max, pd.Timestamp.min)
            else:
                df['date'] = pd.to_datetime(df['date'])
                # Drop timezone so comparisons against naive start/end work.
                if df['date'].dt.tz is not None:
                    df['date'] = df['date'].dt.tz_convert('UTC').dt.tz_localize(None)
                result = (df['date'].min(), df['date'].max())
            _save_date_cache(file_path, result[0], result[1])
            return result
        except Exception as e:
            print(f"Warning: Error reading data for {file_path}: {e}")
            # NOTE(review): a transient read error is cached as "no data"
            # until the file's mtime changes — confirm this is acceptable.
            result = (pd.Timestamp.max, pd.Timestamp.min)
            _save_date_cache(file_path, result[0], result[1])
            return result

    def check_pair(pair: str) -> Tuple[str, bool]:
        # True iff the pair's data file range overlaps [start_ts, end_ts].
        file_path = get_pair_data_file_path(pair, config_path, timeframe)
        if file_path is None:
            return (pair, False)

        file_min, file_max = get_date_range(file_path)
        has_data = not (file_max < start_ts or file_min > end_ts)
        return (pair, has_data)

    # Bug fix: the original `os.cpu_count() or 4 * 5` parsed as
    # `os.cpu_count() or 20`; the intended fallback is (cpu_count or 4) * 5.
    max_workers = min(32, len(pairs), (os.cpu_count() or 4) * 5)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(check_pair, pairs))

    valid_pairs = [pair for pair, is_valid in results if is_valid]
    invalid_pairs = [pair for pair, is_valid in results if not is_valid]

    if invalid_pairs:
        print(f"Filtered out {len(invalid_pairs)} pairs with no data in the specified timerange:")
        for pair in invalid_pairs[:10]:  # Show only first 10
            print(f"  - {pair}")
        if len(invalid_pairs) > 10:
            print(f"  ... and {len(invalid_pairs) - 10} more pairs")

    print(f"Kept {len(valid_pairs)} pairs with available data")
    return valid_pairs
|
||||
|
||||
|
||||
Reference in New Issue
Block a user