
F003 - File Storage Configuration

Objective

Configure Backblaze B2 storage for the analyzer-app to handle large manuscript files with proper security, versioning, and processing optimization. Integrate with MyStoryFlow’s existing storage patterns.

Requirements

Functional Requirements

  • Support manuscript files up to 50MB (PDF) and 25MB (DOCX)
  • Secure file upload with virus scanning
  • Automatic file processing and content extraction
  • Version control for manuscript revisions
  • Encrypted storage with user-specific access controls
  • CDN integration for fast report delivery

Technical Requirements

  • Configure Backblaze B2 bucket for analyzer-app
  • Implement file type validation and size limits
  • Create secure upload URLs with expiration
  • Integrate with @mystoryflow/shared utilities
  • Configure CDN for report distribution
  • Track storage usage in admin-app

Storage Architecture

1. File Organization Structure

manuscripts/
├── {user_id}/
│   ├── originals/
│   │   └── {manuscript_id}/
│   │       ├── v1.pdf
│   │       ├── v2.pdf
│   │       └── metadata.json
│   ├── processed/
│   │   └── {manuscript_id}/
│   │       ├── extracted_text.txt
│   │       ├── chunks/
│   │       │   ├── chunk_001.txt
│   │       │   └── chunk_002.txt
│   │       └── analysis_cache.json
│   └── reports/
│       └── {manuscript_id}/
│           ├── analysis_report.pdf
│           ├── analysis_report.html
│           └── analysis_data.json
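
A sketch of what the metadata.json stored next to each original might contain; every field name here is an assumption for illustration, not a confirmed schema:

// Hypothetical shape of manuscripts/{user_id}/originals/{manuscript_id}/metadata.json.
// All fields are illustrative assumptions, not a fixed contract.
interface ManuscriptMetadata {
  manuscriptId: string
  currentVersion: number
  versions: Array<{
    version: number        // matches the v{n}.{ext} file name
    fileName: string       // original upload name, e.g. "book.pdf"
    fileType: string       // MIME type
    fileSize: number       // bytes
    uploadedAt: string     // ISO 8601 timestamp
  }>
}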

2. Environment Configuration

# Backblaze B2 Configuration (from root .env)
NEXT_PUBLIC_BACKBLAZE_BUCKET_NAME=story-analyzer
NEXT_PUBLIC_BACKBLAZE_BUCKET_ID=your-bucket-id
NEXT_PUBLIC_BACKBLAZE_ENDPOINT=https://s3.us-east-005.backblazeb2.com
BACKBLAZE_ACCESS_KEY_ID=your-access-key-id
BACKBLAZE_SECRET_ACCESS_KEY=your-secret-access-key

# Storage Limits
ANALYZER_MAX_FILE_SIZE_MB=50
ANALYZER_ALLOWED_FILE_TYPES=pdf,docx,txt,rtf,odt

# File Processing
VIRUS_SCAN_ENABLED=true
VIRUS_SCAN_API_KEY=your-virus-scan-key

# Retention Policy
MANUSCRIPT_RETENTION_DAYS=730  # 2 years
REPORT_RETENTION_DAYS=365      # 1 year
ARCHIVE_OLD_FILES=true
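
A small boot-time check can catch missing or malformed values early. The following is a sketch assuming only the variable names above; loadStorageConfig is a hypothetical helper, not an existing @mystoryflow/shared utility:

// Minimal sketch: parse and validate storage-related env vars at startup.
// loadStorageConfig is hypothetical; only the variable names above are assumed.
interface StorageConfig {
  bucketName: string
  endpoint: string
  maxFileSizeBytes: number
  allowedFileTypes: string[]
  virusScanEnabled: boolean
}

export function loadStorageConfig(env = process.env): StorageConfig {
  const required = ['NEXT_PUBLIC_BACKBLAZE_BUCKET_NAME', 'NEXT_PUBLIC_BACKBLAZE_ENDPOINT']
  for (const key of required) {
    if (!env[key]) throw new Error(`Missing required env var: ${key}`)
  }
  return {
    bucketName: env.NEXT_PUBLIC_BACKBLAZE_BUCKET_NAME!,
    endpoint: env.NEXT_PUBLIC_BACKBLAZE_ENDPOINT!,
    maxFileSizeBytes: Number(env.ANALYZER_MAX_FILE_SIZE_MB ?? '50') * 1024 * 1024,
    allowedFileTypes: (env.ANALYZER_ALLOWED_FILE_TYPES ?? 'pdf,docx,txt').split(','),
    virusScanEnabled: env.VIRUS_SCAN_ENABLED === 'true'
  }
}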

3. File Upload Service Extension

// packages/supabase/src/storage/manuscript-storage.ts
import { StorageClient } from './storage-client'

export class ManuscriptStorageService extends StorageClient {
  private manuscriptBucket = 'manuscript-storage'
  private reportsBucket = 'analysis-reports'

  async uploadManuscript(
    userId: string,
    manuscriptId: string,
    file: File,
    version: number = 1
  ): Promise<UploadResult> {
    // 1. Validate file type and size
    await this.validateManuscriptFile(file)

    // 2. Generate secure upload path (matches the layout in section 1)
    const uploadPath = this.generateUploadPath(userId, manuscriptId, version, file.name)

    // 3. Create virus scan job
    if (process.env.VIRUS_SCAN_ENABLED === 'true') {
      await this.scheduleVirusScan(file, uploadPath)
    }

    // 4. Upload with encryption
    const result = await this.uploadWithEncryption(
      this.manuscriptBucket,
      uploadPath,
      file,
      {
        userId,
        manuscriptId,
        version,
        uploadedAt: new Date().toISOString()
      }
    )

    // 5. Schedule content extraction
    await this.scheduleContentExtraction(uploadPath, manuscriptId)

    return result
  }

  private async validateManuscriptFile(file: File): Promise<void> {
    const allowedTypes = [
      'application/pdf',
      'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
      'text/plain',
      'application/rtf',
      'application/vnd.oasis.opendocument.text'
    ]

    if (!allowedTypes.includes(file.type)) {
      throw new Error(`Unsupported file type: ${file.type}`)
    }

    const maxSize = this.getMaxFileSize(file.type)
    if (file.size > maxSize) {
      throw new Error(`File too large. Maximum size: ${maxSize / 1024 / 1024}MB`)
    }
  }

  private getMaxFileSize(fileType: string): number {
    const sizeLimits: Record<string, number> = {
      'application/pdf': 50 * 1024 * 1024, // 50MB
      'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 25 * 1024 * 1024, // 25MB
      'text/plain': 10 * 1024 * 1024, // 10MB
      'application/rtf': 15 * 1024 * 1024, // 15MB
      'application/vnd.oasis.opendocument.text': 25 * 1024 * 1024 // 25MB
    }
    return sizeLimits[fileType] || 10 * 1024 * 1024
  }

  private generateUploadPath(
    userId: string,
    manuscriptId: string,
    version: number,
    fileName: string
  ): string {
    const fileExtension = fileName.split('.').pop()
    return `manuscripts/${userId}/originals/${manuscriptId}/v${version}.${fileExtension}`
  }

  async createSecureUploadUrl(
    userId: string,
    manuscriptId: string,
    fileName: string,
    version: number = 1
  ): Promise<SecureUploadUrl> {
    const uploadPath = this.generateUploadPath(userId, manuscriptId, version, fileName)

    // Generate presigned URL with 1 hour expiration
    const presignedUrl = await this.generatePresignedUploadUrl(
      this.manuscriptBucket,
      uploadPath,
      60 * 60 // 1 hour
    )

    return {
      uploadUrl: presignedUrl,
      uploadPath,
      expiresAt: new Date(Date.now() + 60 * 60 * 1000),
      maxSize: this.getMaxFileSize(this.getFileTypeFromName(fileName))
    }
  }
}
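
For context, a server-side caller might use the service like this (the import path and ids are placeholders):

import { ManuscriptStorageService } from '@mystoryflow/supabase/storage/manuscript-storage'

// Usage sketch (import path assumed): upload a new revision of an existing manuscript.
async function uploadRevision(userId: string, manuscriptId: string, file: File) {
  const storage = new ManuscriptStorageService()
  // Version 2 lands at manuscripts/{userId}/originals/{manuscriptId}/v2.{ext}
  const result = await storage.uploadManuscript(userId, manuscriptId, file, 2)
  return result
}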

4. Content Extraction Service

// packages/supabase/src/storage/content-extractor.ts
export class ContentExtractionService {
  async extractContent(
    filePath: string,
    fileType: string,
    manuscriptId: string
  ): Promise<ExtractedContent> {
    const fileBuffer = await this.downloadFile(filePath)

    let extractedText: string
    let metadata: any = {}

    switch (fileType) {
      case 'application/pdf': {
        const pdfResult = await this.extractFromPDF(fileBuffer)
        extractedText = pdfResult.text
        metadata = { pages: pdfResult.pages, ...pdfResult.metadata }
        break
      }
      case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': {
        const docxResult = await this.extractFromDOCX(fileBuffer)
        extractedText = docxResult.text
        metadata = docxResult.metadata
        break
      }
      case 'text/plain':
        extractedText = fileBuffer.toString('utf-8')
        break
      default:
        throw new Error(`Unsupported file type for extraction: ${fileType}`)
    }

    // Clean and structure the text
    const cleanText = this.cleanExtractedText(extractedText)
    const wordCount = this.countWords(cleanText)
    const structure = await this.analyzeStructure(cleanText)

    // Store processed content
    await this.storeProcessedContent(manuscriptId, {
      originalText: extractedText,
      cleanText,
      wordCount,
      structure,
      metadata,
      extractedAt: new Date()
    })

    return { text: cleanText, wordCount, structure, metadata }
  }

  private async extractFromPDF(buffer: Buffer): Promise<PDFExtractionResult> {
    // Use pdf-parse or similar library
    const pdf = require('pdf-parse')
    const data = await pdf(buffer)
    return {
      text: data.text,
      pages: data.numpages,
      metadata: data.info
    }
  }

  private async extractFromDOCX(buffer: Buffer): Promise<DOCXExtractionResult> {
    // Use mammoth.js or similar library
    const mammoth = require('mammoth')
    const result = await mammoth.extractRawText({ buffer })
    return {
      text: result.value,
      metadata: {
        messages: result.messages,
        extractedAt: new Date()
      }
    }
  }

  private cleanExtractedText(text: string): string {
    return text
      .replace(/\r\n/g, '\n')     // Normalize line endings
      .replace(/\n{3,}/g, '\n\n') // Remove excessive line breaks
      .replace(/[ \t]+/g, ' ')    // Collapse horizontal whitespace only, preserving the
                                  // line breaks that analyzeStructure() splits on
      .trim()
  }

  private countWords(text: string): number {
    return text.split(/\s+/).filter(word => word.length > 0).length
  }

  private async analyzeStructure(text: string): Promise<DocumentStructure> {
    // Basic structure detection
    const lines = text.split('\n')
    const chapters: Chapter[] = []
    let currentChapter: Chapter | null = null

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i].trim()

      // Detect chapter headings (basic patterns)
      if (this.isChapterHeading(line)) {
        if (currentChapter) {
          chapters.push(currentChapter)
        }
        currentChapter = {
          title: line,
          startLine: i,
          content: []
        }
      } else if (currentChapter && line.length > 0) {
        currentChapter.content.push(line)
      }
    }

    if (currentChapter) {
      chapters.push(currentChapter)
    }

    return {
      chapters,
      totalChapters: chapters.length,
      estimatedReadingTime: Math.ceil(this.countWords(text) / 250) // 250 WPM average
    }
  }

  private isChapterHeading(line: string): boolean {
    const chapterPatterns = [
      /^chapter\s+\d+/i,
      /^ch\.\s*\d+/i,
      /^\d+\.\s+/,
      /^part\s+\d+/i
    ]
    return chapterPatterns.some(pattern => pattern.test(line))
  }
}
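
The storage layout in section 1 reserves a chunks/ directory, but this section does not define the chunker. A minimal sketch follows; the 2,000-word chunk size and zero-padded file names are assumptions for illustration:

// Sketch of a word-count chunker for the processed/{manuscript_id}/chunks/ layout.
// Chunk size and naming scheme are assumptions, not a confirmed spec.
function chunkText(cleanText: string, wordsPerChunk = 2000): Array<{ name: string; text: string }> {
  const words = cleanText.split(/\s+/).filter(w => w.length > 0)
  const chunks: Array<{ name: string; text: string }> = []
  for (let i = 0; i < words.length; i += wordsPerChunk) {
    const index = chunks.length + 1
    chunks.push({
      name: `chunk_${String(index).padStart(3, '0')}.txt`, // chunk_001.txt, chunk_002.txt, ...
      text: words.slice(i, i + wordsPerChunk).join(' ')
    })
  }
  return chunks
}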

Database Changes

Additional Tables

-- File Storage Tracking
CREATE TABLE analyzer.manuscript_files (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  manuscript_id UUID NOT NULL REFERENCES analyzer.manuscripts(id) ON DELETE CASCADE,
  version_number INTEGER NOT NULL DEFAULT 1,
  file_path TEXT NOT NULL,
  file_name VARCHAR(255) NOT NULL,
  file_size BIGINT NOT NULL,
  file_type VARCHAR(50) NOT NULL,
  storage_bucket VARCHAR(100) NOT NULL,
  upload_status VARCHAR(50) DEFAULT 'uploading',   -- uploading, completed, failed, virus_detected
  virus_scan_status VARCHAR(50) DEFAULT 'pending', -- pending, clean, infected, failed
  extraction_status VARCHAR(50) DEFAULT 'pending', -- pending, processing, completed, failed
  uploaded_at TIMESTAMPTZ DEFAULT NOW(),
  processed_at TIMESTAMPTZ,
  created_at TIMESTAMPTZ DEFAULT NOW(),
  updated_at TIMESTAMPTZ DEFAULT NOW()
);

-- Content Extraction Results
CREATE TABLE analyzer.extracted_content (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  manuscript_id UUID NOT NULL REFERENCES analyzer.manuscripts(id) ON DELETE CASCADE,
  file_id UUID NOT NULL REFERENCES analyzer.manuscript_files(id) ON DELETE CASCADE,
  original_text TEXT,
  clean_text TEXT NOT NULL,
  word_count INTEGER NOT NULL,
  structure_data JSONB DEFAULT '{}',
  extraction_metadata JSONB DEFAULT '{}',
  created_at TIMESTAMPTZ DEFAULT NOW()
);
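
For reference, the upload pipeline could drive the status columns like this. This is a sketch assuming the shared client exposes the standard supabase-js v2 API (including .schema()) and that the caller is allowed to write to the analyzer schema:

import { supabase } from './supabase-client'

// Sketch: record a virus-scan result on a manuscript_files row.
// Table and column names match the DDL above; error handling is minimal for brevity.
async function markFileScanned(fileId: string, infected: boolean) {
  const { error } = await supabase
    .schema('analyzer')
    .from('manuscript_files')
    .update({
      virus_scan_status: infected ? 'infected' : 'clean',
      upload_status: infected ? 'virus_detected' : 'completed',
      updated_at: new Date().toISOString()
    })
    .eq('id', fileId)
  if (error) throw error
}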

API Endpoints

Upload Endpoints

// POST /api/manuscripts/{id}/upload
// Generate secure upload URL

// POST /api/manuscripts/{id}/versions
// Create new version of manuscript

// GET /api/manuscripts/{id}/files
// List all files for manuscript

// DELETE /api/manuscripts/{id}/files/{fileId}
// Delete specific file version
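
A sketch of the first endpoint as a Next.js App Router handler (file path assumed; getSessionUser is a hypothetical auth helper, and the JSON body shape is an assumption):

// apps/analyzer-app/src/app/api/manuscripts/[id]/upload/route.ts (path assumed)
import { NextResponse } from 'next/server'
import { ManuscriptStorageService } from '@/lib/storage/manuscript-storage'

export async function POST(request: Request, { params }: { params: { id: string } }) {
  const user = await getSessionUser(request) // hypothetical: resolve the authenticated user
  if (!user) {
    return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
  }

  const { fileName, version } = await request.json() // assumed body: { fileName, version? }
  const storage = new ManuscriptStorageService()
  const upload = await storage.createSecureUploadUrl(user.id, params.id, fileName, version ?? 1)

  return NextResponse.json(upload) // { uploadUrl, uploadPath, expiresAt, maxSize }
}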

Testing Requirements

Unit Tests

// apps/analyzer-app/src/lib/storage/__tests__/manuscript-storage.test.ts
import { ManuscriptStorageService } from '../manuscript-storage'
import { mockFile } from '@mystoryflow/shared/test-utils'

describe('ManuscriptStorageService', () => {
  let service: ManuscriptStorageService

  beforeEach(() => {
    service = new ManuscriptStorageService()
  })

  describe('validateManuscriptFile', () => {
    it('should accept valid file types', async () => {
      const pdfFile = mockFile('test.pdf', 'application/pdf', 5 * 1024 * 1024)
      await expect(service['validateManuscriptFile'](pdfFile)).resolves.toBeUndefined()
    })

    it('should reject invalid file types', async () => {
      const exeFile = mockFile('test.exe', 'application/x-msdownload', 1024)
      await expect(service['validateManuscriptFile'](exeFile)).rejects.toThrow('Unsupported file type')
    })

    it('should enforce size limits', async () => {
      const largePdf = mockFile('large.pdf', 'application/pdf', 60 * 1024 * 1024)
      await expect(service['validateManuscriptFile'](largePdf)).rejects.toThrow('File too large')
    })
  })

  describe('generateUploadPath', () => {
    it('should create consistent paths', () => {
      // Bracket access keeps the method private while allowing white-box testing
      const path1 = service['generateUploadPath']('user123', 'manuscript456', 1, 'book.pdf')
      const path2 = service['generateUploadPath']('user123', 'manuscript456', 1, 'book.pdf')

      expect(path1).toBe(path2)
      expect(path1).toBe('manuscripts/user123/originals/manuscript456/v1.pdf')
    })
  })
})

Integration Tests

// apps/analyzer-app/src/lib/storage/__tests__/upload-flow.integration.test.ts
import { createTestUser, uploadTestFile } from '@mystoryflow/shared/test-utils'
import { ManuscriptStorageService } from '../manuscript-storage'
import { ContentExtractionService } from '../content-extractor'

describe('Upload Flow Integration', () => {
  it('should complete full upload and extraction flow', async () => {
    const user = await createTestUser()
    const file = await uploadTestFile('sample-manuscript.pdf')
    const storage = new ManuscriptStorageService()
    const extractor = new ContentExtractionService()

    // Upload file
    const uploadResult = await storage.uploadManuscript(
      user.id,
      'test-manuscript-id',
      file
    )
    expect(uploadResult.success).toBe(true)

    // Extract content
    const extractResult = await extractor.extractContent(
      uploadResult.path,
      file.type,
      'test-manuscript-id'
    )
    expect(extractResult.wordCount).toBeGreaterThan(0)
    expect(extractResult.text).toBeTruthy()
  })
})

E2E Tests

// apps/analyzer-app/e2e/upload-manuscript.spec.ts
import { test, expect } from '@playwright/test'

test.describe('Manuscript Upload', () => {
  test('should upload PDF manuscript successfully', async ({ page }) => {
    await page.goto('/manuscripts/new')

    // Upload file
    const fileInput = page.locator('input[type="file"]')
    await fileInput.setInputFiles('e2e/fixtures/sample-manuscript.pdf')

    // Fill metadata
    await page.fill('[name="title"]', 'Test Manuscript')
    await page.selectOption('[name="genre"]', 'fantasy')

    // Submit
    await page.click('button[type="submit"]')

    // Verify upload progress
    await expect(page.locator('[data-testid="upload-progress"]')).toBeVisible()

    // Wait for completion
    await page.waitForSelector('[data-testid="upload-success"]', { timeout: 30000 })

    // Verify file appears in list
    await page.goto('/manuscripts')
    await expect(page.locator('text=Test Manuscript')).toBeVisible()
  })
})

Security Considerations

File Security

  • All files encrypted at rest
  • Virus scanning before processing
  • User-specific access controls
  • Secure presigned URLs with expiration

Privacy Protection

  • Manuscript content never logged in plain text
  • Automatic deletion based on retention policy (see the cleanup sketch after this list)
  • GDPR-compliant data export/deletion
  • Audit trail for all file operations
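
A minimal sketch of the retention cleanup, assuming a scheduled job, the MANUSCRIPT_RETENTION_DAYS variable from the environment section, and the supabase-js v2 API; deleteFileFromBucket is a hypothetical helper:

import { supabase } from './supabase-client'

// Retention sketch: find files past MANUSCRIPT_RETENTION_DAYS and remove them.
// deleteFileFromBucket is hypothetical; run this from a scheduled job (e.g. daily cron).
async function purgeExpiredManuscripts() {
  const retentionDays = Number(process.env.MANUSCRIPT_RETENTION_DAYS ?? '730')
  const cutoff = new Date(Date.now() - retentionDays * 24 * 60 * 60 * 1000).toISOString()

  const { data: expired, error } = await supabase
    .schema('analyzer')
    .from('manuscript_files')
    .select('id, file_path, storage_bucket')
    .lt('uploaded_at', cutoff)
  if (error) throw error

  for (const file of expired ?? []) {
    await deleteFileFromBucket(file.storage_bucket, file.file_path) // hypothetical helper
    await supabase.schema('analyzer').from('manuscript_files').delete().eq('id', file.id)
  }
}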

Acceptance Criteria

Must Have

  • Support for PDF, DOCX, TXT, RTF, ODT files
  • File size validation and limits
  • Secure upload with virus scanning
  • Content extraction working for all formats
  • Version control for manuscript revisions
  • Encrypted storage with access controls

Should Have

  • CDN integration for fast report delivery
  • Automatic backup and archiving
  • Progress tracking for uploads
  • Thumbnail generation for PDFs
  • File metadata preservation

Could Have

  • Google Docs integration
  • Batch upload capabilities
  • Advanced file compression
  • Real-time collaboration features

Dependencies

  • F001-PROJECT-SETUP (environment configuration)
  • F002-DATABASE-SCHEMA (manuscript tables)
  • F000B-SHARED-PACKAGES (storage utilities from @mystoryflow/shared)
  • MyStoryFlow’s existing Backblaze B2 configuration

Estimated Effort

  • Development: 4 days
  • Testing: 2 days
  • Security Review: 1 day
  • Documentation: 1 day
  • Total: 8 days

Implementation Notes

Priority Order

  1. Integrate with @mystoryflow/shared storage utilities
  2. Implement file validation for manuscript-specific types
  3. Build content extraction pipeline with AI tracking
  4. Add virus scanning integration
  5. Set up CDN for report delivery
  6. Implement version control

Risk Considerations

  • Large file uploads may time out - implement chunked uploads (see the sketch after this list)
  • Content extraction accuracy varies by file type - test all formats
  • Virus scanning adds processing delay - make it async
  • Storage costs scale with usage - monitor via admin-app
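
A minimal sketch of chunked uploads against B2's S3-compatible endpoint, assuming the @aws-sdk packages are added to the repo; @aws-sdk/lib-storage splits the body into multipart uploads automatically:

import { S3Client } from '@aws-sdk/client-s3'
import { Upload } from '@aws-sdk/lib-storage'

// Chunked-upload sketch: multipart uploads (5 MB parts here) avoid single-request
// timeouts on large manuscripts. Env var names match the configuration section above.
const s3 = new S3Client({
  endpoint: process.env.NEXT_PUBLIC_BACKBLAZE_ENDPOINT,
  region: 'us-east-005',
  credentials: {
    accessKeyId: process.env.BACKBLAZE_ACCESS_KEY_ID!,
    secretAccessKey: process.env.BACKBLAZE_SECRET_ACCESS_KEY!
  }
})

export async function uploadLargeFile(key: string, body: Buffer | ReadableStream) {
  const upload = new Upload({
    client: s3,
    params: { Bucket: process.env.NEXT_PUBLIC_BACKBLAZE_BUCKET_NAME!, Key: key, Body: body },
    partSize: 5 * 1024 * 1024, // 5 MB parts
    queueSize: 4               // parallel part uploads
  })
  upload.on('httpUploadProgress', p => console.log(`uploaded ${p.loaded} of ${p.total} bytes`))
  return upload.done()
}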

Storage Service Integration

// apps/analyzer-app/src/lib/supabase-client.ts
import { createBrowserClient } from '@mystoryflow/auth/client'

export const supabase = createBrowserClient()

// apps/analyzer-app/src/lib/storage-client.ts
import { createStorageClient } from '@mystoryflow/shared/storage'
import { supabase } from './supabase-client'

export const storage = createStorageClient({
  supabase,
  bucket: process.env.NEXT_PUBLIC_BACKBLAZE_BUCKET_NAME!,
  cdnUrl: process.env.NEXT_PUBLIC_BACKBLAZE_ENDPOINT!
})

Next Feature

After completion, proceed to F004-AI-SERVICES to set up AI service integration for manuscript analysis.