支持缓存

2026-01-07 12:17:53 +08:00
parent d361f3a1c1
commit 2ea68d1916
1 changed files with 77 additions and 37 deletions
--- a/cloudbt/run_cloudbt.py
+++ b/cloudbt/run_cloudbt.py
@@ -2,7 +2,10 @@
 import argparse
 import datetime
 import os
-from typing import List, Optional
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 import threading
 from typing import Dict, List, Optional, Tuple
 import requests
 from myscripts import working_tool
 import dotenv
@@ -91,60 +94,97 @@ def get_pair_data_file_path(pair: str, config_path: str, timeframe: str) -> Opti
        print(f"Warning: Failed to get data file path for pair {pair}: {e}")
        return None
 import hashlib
 import json
 # Directory (relative to the current working directory) that holds one small
 # JSON file per cached data file, each storing that file's min/max date.
 DATE_CACHE_DIR = Path('.cache') / 'pairs_date'
 def _get_cache_path(file_path: Path) -> Path:
    """Return the cache-entry path for ``file_path``.

    The entry name is the first 32 hex characters of the SHA-256 of
    ``"<file_path>:<mtime>"``.  Because the modification time is part of the
    key, a rewritten data file maps to a different entry and the old one is
    simply never looked up again.

    Args:
        file_path: Data file to derive the cache entry for.  A plain string
            path is accepted as well and is converted to ``Path``.

    Returns:
        Path of the JSON cache entry inside ``DATE_CACHE_DIR``.

    Raises:
        OSError: If ``file_path`` cannot be stat'ed (e.g. it does not exist).
    """
    # Callers may hand over a plain string path; ``str`` has no ``.stat()``.
    file_path = Path(file_path)
    mtime = file_path.stat().st_mtime
    file_key = f"{file_path}:{mtime}"
    hash_key = hashlib.sha256(file_key.encode()).hexdigest()[:32]
    return DATE_CACHE_DIR / f"{hash_key}.json"
 def _load_date_cache(file_path: Path) -> Optional[Tuple[pd.Timestamp, pd.Timestamp]]:
    """Return the cached ``(min, max)`` date range for ``file_path``, if any.

    The entry is located with ``_get_cache_path``.  Every failure mode (the
    data file cannot be stat'ed, no entry exists, the entry is unreadable or
    malformed) is treated as a cache miss rather than an error, so callers
    can always fall back to reading the data file.

    Args:
        file_path: Data file whose date range may have been cached.

    Returns:
        ``(min, max)`` as ``pd.Timestamp`` objects, or ``None`` on a miss.
    """
    try:
        cache_path = _get_cache_path(file_path)
    except OSError:
        # The data file itself cannot be stat'ed, so no cache key exists.
        return None
    try:
        with open(cache_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return (pd.Timestamp(data['min']), pd.Timestamp(data['max']))
    except FileNotFoundError:
        # No entry yet: the ordinary, silent cache miss.
        return None
    except (OSError, ValueError, KeyError, TypeError) as e:
        # Unreadable or malformed entry: report it instead of hiding it, then
        # behave like a miss so the range is recomputed from the data file.
        print(f"Warning: Ignoring unreadable date cache {cache_path}: {e}")
        return None
 def _save_date_cache(file_path: Path, date_min: pd.Timestamp, date_max: pd.Timestamp) -> None:
    """Store the ``(min, max)`` date range of ``file_path`` in the date cache.

    Saving is best-effort: any failure (cache directory cannot be created,
    data file cannot be stat'ed for its cache key, entry cannot be written)
    only prints a warning and never propagates to the caller.

    Args:
        file_path: Data file the range belongs to.
        date_min: Earliest date found in the file.
        date_max: Latest date found in the file.
    """
    try:
        # Directory creation and key derivation both touch the filesystem, so
        # they live inside the guarded section together with the write.
        DATE_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        cache_path = _get_cache_path(file_path)
        with open(cache_path, 'w', encoding='utf-8') as f:
            json.dump({'min': str(date_min), 'max': str(date_max)}, f)
    except Exception as e:
        # Deliberately broad: a cache write must never abort the real work.
        print(f"Warning: Failed to save date cache for {file_path}: {e}")
 def filter_pairs_by_timerange(pairs: List[str], timerange: str, config_path: str, timeframe: str) -> List[str]:
    """Return the pairs whose stored data overlaps ``timerange``.

    For every pair the data file is located with ``get_pair_data_file_path``
    and its earliest/latest ``date`` values are taken from the on-disk date
    cache or, on a miss, read from the feather file and then cached.  A pair
    is kept when that ``[min, max]`` span intersects the requested range.
    Lookups run concurrently in a thread pool; the order of ``pairs`` is
    preserved in the result.

    Args:
        pairs: Pair names to check.
        timerange: Inclusive range formatted as ``YYYYMMDD-YYYYMMDD``.
        config_path: Forwarded to ``get_pair_data_file_path``.
        timeframe: Forwarded to ``get_pair_data_file_path``.

    Returns:
        The pairs with data inside the range; ``[]`` when ``timerange``
        cannot be parsed or ``pairs`` is empty.
    """
    print(f"Checking pair data availability for timerange: {timerange}...")
    try:
        # Parse timerange string into start and end timestamps
        start_str, end_str = timerange.split('-')
        start_ts = pd.Timestamp(datetime.datetime.strptime(start_str, '%Y%m%d'))
        end_ts = pd.Timestamp(datetime.datetime.strptime(end_str, '%Y%m%d'))
    except (ValueError, TypeError, AttributeError) as e:
        print(f"Error parsing timerange: {e}")
        return []

    # Inverted range used as "no data": it can never overlap a real timerange.
    no_data = (pd.Timestamp.max, pd.Timestamp.min)

    def get_date_range(file_path: Path) -> Tuple[pd.Timestamp, pd.Timestamp]:
        """Return (min, max) of the file's ``date`` column, using the cache."""
        try:
            cached = _load_date_cache(file_path)
            if cached is not None:
                return cached
            # Read only the date column to keep the I/O small.
            df = pd.read_feather(file_path, columns=['date'])
            if df.empty:
                result = no_data
            else:
                dates = pd.to_datetime(df['date'])
                # Compare as naive UTC, matching the naive start/end timestamps.
                if dates.dt.tz is not None:
                    dates = dates.dt.tz_convert('UTC').dt.tz_localize(None)
                result = (dates.min(), dates.max())
        except Exception as e:
            # Broad on purpose: reader errors come from several libraries.
            # The failure is NOT cached, so a transient error is retried on
            # the next run instead of sticking until the file changes.
            print(f"Warning: Error reading data for {file_path}: {e}")
            return no_data
        _save_date_cache(file_path, result[0], result[1])
        return result

    def check_pair(pair: str) -> Tuple[str, bool]:
        """Return ``(pair, True)`` when the pair has data in the timerange."""
        file_path = get_pair_data_file_path(pair, config_path, timeframe)
        if file_path is None:
            return (pair, False)
        # The helper may return a plain string path; normalise it once here.
        file_min, file_max = get_date_range(Path(file_path))
        # NaT compares False both ways and would otherwise count as overlap.
        if pd.isna(file_min) or pd.isna(file_max):
            return (pair, False)
        has_data = not (file_max < start_ts or file_min > end_ts)
        return (pair, has_data)

    if pairs:
        # NOTE(review): this bound was previously spelled
        # ``os.cpu_count() or 4 * 5``, which evaluates to
        # ``os.cpu_count() or 20``; behaviour kept, confirm whether
        # ``(os.cpu_count() or 4) * 5`` was intended.
        worker_cap = os.cpu_count() or 20
        max_workers = min(32, len(pairs), worker_cap)
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(executor.map(check_pair, pairs))
    else:
        # ThreadPoolExecutor rejects max_workers=0, so skip the pool entirely.
        results = []

    valid_pairs = [pair for pair, is_valid in results if is_valid]
    invalid_pairs = [pair for pair, is_valid in results if not is_valid]
    if invalid_pairs:
        print(f"Filtered out {len(invalid_pairs)} pairs with no data in the specified timerange:")
        # Show only the first 10 to keep the output short.
        for pair in invalid_pairs[:10]:
            print(f"  - {pair}")
        if len(invalid_pairs) > 10:
            print(f"  ... and {len(invalid_pairs) - 10} more pairs")

    print(f"Kept {len(valid_pairs)} pairs with available data")
    return valid_pairs