UNPKG

cursor-rules-agent

Version:

Minimal Cursor IDE workflow rules integration with auto-mapping and multi-source capabilities

412 lines (360 loc) 12.9 kB
---
# {{PROJECT_NAME}} Scraping Architecture Blueprint
# Generated: {{GENERATED_DATE}}
# Version: {{VERSION}}
# Target Website: {{TARGET_WEBSITE}}
#
# This file is a template: every {{NAME}} token is a placeholder.
# Quoted placeholders ("{{X}}") become strings. Unquoted placeholders
# ({{X}}) are presumably meant to be substituted with a bare boolean or
# number BEFORE the file is parsed as YAML; left unsubstituted they parse
# as nested flow mappings, not scalars.
#
# NOTE(review): block nesting was reconstructed from key order in a
# flattened copy. Confirm the parents of data_pipeline, performance_config,
# resource_management, logging and performance_tests against the consumer.

metadata:
  name: "{{PROJECT_NAME}}_Scraping"
  version: "{{VERSION}}"
  description: "Comprehensive scraping architecture for {{TARGET_WEBSITE}}"
  owner: "{{OWNER}}"
  architect: "{{ARCHITECT}}"
  created: "{{CREATED_DATE}}"
  updated: "{{UPDATED_DATE}}"
  status: "{{STATUS}}"  # draft|review|approved|deprecated
  target_analysis_ref: "{{TARGET_ANALYSIS_FILE}}"

# Target Website Information
target_website:
  name: "{{TARGET_NAME}}"
  url: "{{TARGET_URL}}"
  type: "{{WEBSITE_TYPE}}"  # news|ecommerce|social|entertainment|blog
  complexity: "{{COMPLEXITY}}"  # simple|moderate|complex|enterprise
  estimated_pages: {{ESTIMATED_PAGES}}
  content_freshness: "{{CONTENT_FRESHNESS}}"  # static|daily|hourly|realtime

# Scraping Strategy
scraping_strategy:
  primary_approach: "{{PRIMARY_APPROACH}}"  # api_first|scraping_primary|hybrid
  fallback_approach: "{{FALLBACK_APPROACH}}"

  api_integration:
    available: {{API_AVAILABLE}}  # true|false
    endpoints:
      - path: "{{API_ENDPOINT_1}}"
        method: "{{HTTP_METHOD_1}}"
        rate_limit: "{{RATE_LIMIT_1}}"
        authentication: "{{AUTH_TYPE_1}}"
      - path: "{{API_ENDPOINT_2}}"
        method: "{{HTTP_METHOD_2}}"
        rate_limit: "{{RATE_LIMIT_2}}"
        authentication: "{{AUTH_TYPE_2}}"

  scraping_config:
    method: "{{SCRAPING_METHOD}}"  # browser_automation|http_requests|hybrid
    tools:
      - "{{TOOL_1}}"  # playwright|puppeteer|scrapy|requests
      - "{{TOOL_2}}"
    browser_requirements:
      headless: {{HEADLESS_MODE}}
      javascript: {{JAVASCRIPT_REQUIRED}}
      cookies: {{COOKIES_REQUIRED}}

# Architecture Components
architecture:
  type: "scraping_system"
  pattern: "{{ARCHITECTURE_PATTERN}}"  # microservices|monolith|modular

  components:
    orchestrator:
      name: "scraping_orchestrator"
      technology: "{{ORCHESTRATOR_TECH}}"  # nodejs|python|golang
      responsibilities:
        - "Schedule scraping jobs"
        - "Monitor scraper health"
        - "Distribute workload"
        - "Handle failures and retries"
      scaling: "{{ORCHESTRATOR_SCALING}}"  # single|cluster|distributed

    scrapers:
      count: {{SCRAPER_COUNT}}
      technology: "{{SCRAPER_TECH}}"
      concurrent_limit: {{CONCURRENT_LIMIT}}
      instances:
        - name: "content_scraper"
          target: "{{CONTENT_TARGET}}"
          selectors: "{{CONTENT_SELECTORS}}"
        - name: "media_scraper"
          target: "{{MEDIA_TARGET}}"
          selectors: "{{MEDIA_SELECTORS}}"

    # Ordered stages that scraped data passes through.
    data_pipeline:
      processing_type: "{{PROCESSING_TYPE}}"  # realtime|batch|hybrid
      validation_level: "{{VALIDATION_LEVEL}}"  # strict|moderate|flexible
      stages:
        - name: "extraction"
          description: "Raw data extraction from target"
          technology: "{{EXTRACTION_TECH}}"
        - name: "validation"
          description: "Data quality validation"
          rules: "{{VALIDATION_RULES}}"
        - name: "transformation"
          description: "Data format transformation"
          format: "{{OUTPUT_FORMAT}}"
        - name: "enrichment"
          description: "Data enrichment and augmentation"
          sources: "{{ENRICHMENT_SOURCES}}"
        - name: "storage"
          description: "Persistent data storage"
          destination: "{{STORAGE_DESTINATION}}"

# Data Models & Schema
data_models:
  entities:
    - name: "{{ENTITY_1}}"
      description: "{{ENTITY_1_DESCRIPTION}}"
      source_selectors:
        title: "{{TITLE_SELECTOR}}"
        content: "{{CONTENT_SELECTOR}}"
        metadata: "{{METADATA_SELECTOR}}"
      fields:
        - name: "{{FIELD_1}}"
          type: "{{FIELD_TYPE_1}}"
          required: {{FIELD_REQUIRED_1}}
          source: "{{FIELD_SOURCE_1}}"
          validation: "{{FIELD_VALIDATION_1}}"
        - name: "{{FIELD_2}}"
          type: "{{FIELD_TYPE_2}}"
          required: {{FIELD_REQUIRED_2}}
          source: "{{FIELD_SOURCE_2}}"
          validation: "{{FIELD_VALIDATION_2}}"

    - name: "{{ENTITY_2}}"
      description: "{{ENTITY_2_DESCRIPTION}}"
      source_selectors:
        identifier: "{{ID_SELECTOR}}"
        content: "{{CONTENT_SELECTOR_2}}"
      # NOTE(review): reuses the FIELD_*_1 placeholders of the first entity,
      # so both entities receive the same substituted values. Confirm intent.
      fields:
        - name: "{{FIELD_1}}"
          type: "{{FIELD_TYPE_1}}"
          required: {{FIELD_REQUIRED_1}}

  relationships:
    - from: "{{FROM_ENTITY}}"
      to: "{{TO_ENTITY}}"
      type: "{{RELATIONSHIP_TYPE}}"  # one_to_one|one_to_many|many_to_many
      foreign_key: "{{FK_FIELD}}"

# Database Architecture
database:
  primary:
    type: "{{DB_TYPE}}"  # mongodb|postgresql|mysql
    connection: "{{DB_CONNECTION}}"
    performance_config:
      indexes:
        - fields: ["{{INDEX_FIELD_1}}"]
          type: "{{INDEX_TYPE_1}}"
        - fields: ["{{INDEX_FIELD_2}}", "{{INDEX_FIELD_3}}"]
          type: "{{INDEX_TYPE_2}}"
      partitioning:
        enabled: {{PARTITIONING_ENABLED}}
        strategy: "{{PARTITIONING_STRATEGY}}"

  caching:
    layers:
      - name: "redis_cache"
        type: "redis"
        ttl: {{CACHE_TTL}}
        max_size: "{{CACHE_SIZE}}"
        use_cases: ["{{CACHE_USE_CASE_1}}", "{{CACHE_USE_CASE_2}}"]

  backup:
    strategy: "{{BACKUP_STRATEGY}}"
    frequency: "{{BACKUP_FREQUENCY}}"
    retention: "{{BACKUP_RETENTION}}"

# Performance & Scalability
performance:
  requirements:
    pages_per_hour: {{PAGES_PER_HOUR}}
    data_freshness: "{{DATA_FRESHNESS}}"  # <1hour|<4hours|<24hours
    system_availability: "{{AVAILABILITY}}"  # 99.9%|99.99%

  optimization:
    request_throttling:
      enabled: {{THROTTLING_ENABLED}}
      requests_per_second: {{REQUESTS_PER_SECOND}}
      burst_limit: {{BURST_LIMIT}}
    caching_strategy:
      page_cache: {{PAGE_CACHE_ENABLED}}
      result_cache: {{RESULT_CACHE_ENABLED}}
      cdn_integration: {{CDN_ENABLED}}
    resource_management:
      memory_limit: "{{MEMORY_LIMIT}}"
      cpu_limit: "{{CPU_LIMIT}}"
      disk_space: "{{DISK_SPACE}}"

# Error Handling & Resilience
resilience:
  retry_mechanisms:
    exponential_backoff: {{EXPONENTIAL_BACKOFF}}
    max_retries: {{MAX_RETRIES}}
    timeout_escalation: [{{TIMEOUT_1}}, {{TIMEOUT_2}}, {{TIMEOUT_3}}]

  fallback_strategies:
    cache_fallback: {{CACHE_FALLBACK}}
    alternative_sources: ["{{ALT_SOURCE_1}}", "{{ALT_SOURCE_2}}"]
    graceful_degradation: {{GRACEFUL_DEGRADATION}}

  monitoring:
    health_checks:
      - endpoint: "{{HEALTH_ENDPOINT_1}}"
        frequency: "{{HEALTH_FREQUENCY_1}}"
        expected_response: {{EXPECTED_RESPONSE_1}}
    alert_thresholds:
      failure_rate: {{FAILURE_THRESHOLD}}  # 0.1 = 10%
      response_time: {{RESPONSE_TIME_THRESHOLD}}  # milliseconds
      memory_usage: {{MEMORY_THRESHOLD}}  # percentage

  logging:
    level: "{{LOG_LEVEL}}"  # debug|info|warn|error
    destinations: ["{{LOG_DEST_1}}", "{{LOG_DEST_2}}"]
    retention: "{{LOG_RETENTION}}"

# Security & Compliance
security:
  anti_bot_countermeasures:
    user_agent_rotation: {{UA_ROTATION}}
    proxy_rotation: {{PROXY_ROTATION}}
    request_randomization: {{REQUEST_RANDOMIZATION}}
    captcha_handling: {{CAPTCHA_HANDLING}}

  rate_limiting:
    respect_robots_txt: {{RESPECT_ROBOTS}}
    custom_rate_limits: {{CUSTOM_RATE_LIMITS}}
    rate_limit_config:
      - domain: "{{DOMAIN_1}}"
        requests_per_minute: {{RPM_1}}
        concurrent_requests: {{CONCURRENT_1}}

  data_protection:
    encrypt_at_rest: {{ENCRYPT_REST}}
    encrypt_in_transit: {{ENCRYPT_TRANSIT}}
    pii_handling: "{{PII_HANDLING}}"  # anonymize|exclude|secure

  compliance:
    gdpr_compliant: {{GDPR_COMPLIANT}}
    ccpa_compliant: {{CCPA_COMPLIANT}}
    robots_txt_compliance: {{ROBOTS_COMPLIANCE}}
    terms_of_service_review: "{{TOS_REVIEW_STATUS}}"

# Deployment & Infrastructure
deployment:
  hosting:
    type: "{{HOSTING_TYPE}}"  # cloud|vps|on_premise
    provider: "{{HOSTING_PROVIDER}}"
    regions: ["{{REGION_1}}", "{{REGION_2}}"]

  environment_variables:
    - name: "{{ENV_VAR_1}}"
      description: "{{ENV_DESC_1}}"
      required: {{ENV_REQUIRED_1}}
      default: "{{ENV_DEFAULT_1}}"
    - name: "{{ENV_VAR_2}}"
      description: "{{ENV_DESC_2}}"
      required: {{ENV_REQUIRED_2}}

  scaling:
    auto_scaling: {{AUTO_SCALING}}
    min_instances: {{MIN_INSTANCES}}
    max_instances: {{MAX_INSTANCES}}
    scaling_triggers:
      - metric: "{{SCALING_METRIC_1}}"
        threshold: {{SCALING_THRESHOLD_1}}
        action: "{{SCALING_ACTION_1}}"

# Testing Strategy
testing:
  unit_tests:
    framework: "{{TEST_FRAMEWORK}}"
    coverage_target: {{COVERAGE_TARGET}}
    critical_paths: ["{{TEST_PATH_1}}", "{{TEST_PATH_2}}"]

  integration_tests:
    scraping_validation:
      - test_name: "{{TEST_NAME_1}}"
        target_url: "{{TEST_URL_1}}"
        expected_fields: ["{{FIELD_1}}", "{{FIELD_2}}"]
        data_quality_checks: ["{{CHECK_1}}", "{{CHECK_2}}"]

  performance_tests:
    - test_name: "load_test"
      concurrent_scrapers: {{LOAD_TEST_SCRAPERS}}
      duration: "{{LOAD_TEST_DURATION}}"
      success_criteria: "{{LOAD_TEST_CRITERIA}}"

  monitoring_tests:
    uptime_monitoring: {{UPTIME_MONITORING}}
    data_quality_monitoring: {{QUALITY_MONITORING}}
    performance_monitoring: {{PERFORMANCE_MONITORING}}

# Quality Gates
quality_gates:
  gate_1_analysis:
    - "Target website analysis completed"
    - "Legal compliance verified"
    - "Technical feasibility confirmed"
    - "Data models defined"

  gate_2_architecture:
    - "Scraping strategy approved"
    - "Architecture components designed"
    - "Performance requirements defined"
    - "Security measures planned"

  gate_3_implementation:
    - "Core scraping functionality implemented"
    - "Data pipeline operational"
    - "Error handling implemented"
    - "Basic monitoring setup"

  gate_4_production:
    - "Full test suite passing"
    - "Performance benchmarks met"
    - "Security review completed"
    - "Production monitoring active"

# Risk Assessment
# NOTE(review): all three risk categories share the RISK_PROB_1,
# RISK_IMPACT_1, RISK_MITIGATION_1 and RISK_OWNER_1 placeholders, so they
# receive identical substituted values. Confirm this is intended.
risks:
  technical_risks:
    - risk: "{{TECH_RISK_1}}"
      probability: "{{RISK_PROB_1}}"  # low|medium|high
      impact: "{{RISK_IMPACT_1}}"  # low|medium|high
      mitigation: "{{RISK_MITIGATION_1}}"
      owner: "{{RISK_OWNER_1}}"

  operational_risks:
    - risk: "{{OP_RISK_1}}"
      probability: "{{RISK_PROB_1}}"
      impact: "{{RISK_IMPACT_1}}"
      mitigation: "{{RISK_MITIGATION_1}}"
      owner: "{{RISK_OWNER_1}}"

  compliance_risks:
    - risk: "{{COMPLIANCE_RISK_1}}"
      probability: "{{RISK_PROB_1}}"
      impact: "{{RISK_IMPACT_1}}"
      mitigation: "{{RISK_MITIGATION_1}}"
      owner: "{{RISK_OWNER_1}}"

# Implementation Plan
implementation:
  phases:
    phase_1:
      name: "Infrastructure Setup"
      duration: "{{PHASE_1_DURATION}}"
      deliverables:
        - "{{PHASE_1_DELIVERABLE_1}}"
        - "{{PHASE_1_DELIVERABLE_2}}"
      dependencies: ["{{PHASE_1_DEPENDENCY_1}}"]

    # Phases 2-4 each depend on the phase before them.
    phase_2:
      name: "Core Scraping Implementation"
      duration: "{{PHASE_2_DURATION}}"
      deliverables:
        - "{{PHASE_2_DELIVERABLE_1}}"
        - "{{PHASE_2_DELIVERABLE_2}}"
      dependencies: ["phase_1"]

    phase_3:
      name: "Data Pipeline & Processing"
      duration: "{{PHASE_3_DURATION}}"
      deliverables:
        - "{{PHASE_3_DELIVERABLE_1}}"
        - "{{PHASE_3_DELIVERABLE_2}}"
      dependencies: ["phase_2"]

    phase_4:
      name: "Testing & Optimization"
      duration: "{{PHASE_4_DURATION}}"
      deliverables:
        - "{{PHASE_4_DELIVERABLE_1}}"
        - "{{PHASE_4_DELIVERABLE_2}}"
      dependencies: ["phase_3"]

# Approval & Sign-off
approval:
  gate_status: 0  # 0=draft, 1=approved, 2=implemented

  stakeholders:
    - role: "{{STAKEHOLDER_ROLE_1}}"
      name: "{{STAKEHOLDER_NAME_1}}"
      approval_status: "{{APPROVAL_STATUS_1}}"  # pending|approved|rejected
      comments: "{{STAKEHOLDER_COMMENTS_1}}"

  technical_review:
    reviewer: "{{TECH_REVIEWER}}"
    review_date: "{{REVIEW_DATE}}"
    approval_date: "{{APPROVAL_DATE}}"
    implementation_ready: {{IMPLEMENTATION_READY}}

  legal_review:
    reviewer: "{{LEGAL_REVIEWER}}"
    review_date: "{{LEGAL_REVIEW_DATE}}"
    compliance_status: "{{COMPLIANCE_STATUS}}"
    restrictions: "{{LEGAL_RESTRICTIONS}}"

# Documentation Links
documentation:
  target_analysis: "{{TARGET_ANALYSIS_LINK}}"
  api_documentation: "{{API_DOCS_LINK}}"
  deployment_guide: "{{DEPLOYMENT_GUIDE_LINK}}"
  monitoring_guide: "{{MONITORING_GUIDE_LINK}}"

# Version History
version_history:
  - version: "{{VERSION_1}}"
    date: "{{VERSION_DATE_1}}"
    changes: "{{VERSION_CHANGES_1}}"
    author: "{{VERSION_AUTHOR_1}}"