
F008 - Text Segmentation for AI Processing

Objective

Segment manuscripts into appropriately sized chunks for AI analysis, preserving context across chunk boundaries while staying within model token limits.

Quick Implementation

Using MyStoryFlow Components

  • Background job processing from @mystoryflow/shared
  • Database transaction handling via @mystoryflow/database
  • Progress tracking utilities from @mystoryflow/ui
  • Error handling patterns from @mystoryflow/shared

New Requirements

  • Token counting for AI models (see the token-counting sketch after this list)
  • Smart chunking algorithm
  • Context overlap management
  • Chapter-aware segmentation
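
The token-counting requirement maps directly onto the tiktoken encoder used by the chunking service below. A minimal sketch, assuming the npm tiktoken package; the word-based fallback ratio of ~1.3 tokens per word matches the estimate the service itself uses:

// Count tokens the way the chunker does; fall back to a word-based
// estimate (~1.3 tokens per word) if the encoder is unavailable.
import { encoding_for_model } from 'tiktoken'

export function countTokens(text: string): number {
  try {
    const encoder = encoding_for_model('gpt-4')
    const count = encoder.encode(text).length
    encoder.free() // the WASM-backed tiktoken package requires freeing the encoder
    return count
  } catch {
    return Math.ceil(text.split(/\s+/).length * 1.3)
  }
}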

MVP Implementation

1. Database Schema

-- Text chunks for AI processing (in analyzer schema)
CREATE TABLE analyzer.manuscript_chunks (
  id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
  manuscript_id UUID REFERENCES analyzer.manuscripts(id),
  chunk_index INTEGER NOT NULL,
  chunk_type VARCHAR(50) NOT NULL, -- 'chapter', 'section', 'overlap'
  content TEXT NOT NULL,
  token_count INTEGER NOT NULL,
  word_count INTEGER NOT NULL,
  start_position INTEGER NOT NULL,
  end_position INTEGER NOT NULL,
  metadata JSONB DEFAULT '{}',
  created_at TIMESTAMP DEFAULT NOW()
);

-- Chunking sessions
CREATE TABLE analyzer.chunking_sessions (
  id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
  manuscript_id UUID REFERENCES analyzer.manuscripts(id),
  total_chunks INTEGER,
  average_chunk_size INTEGER,
  chunking_strategy VARCHAR(50),
  processing_time_ms INTEGER,
  created_at TIMESTAMP DEFAULT NOW()
);

-- Indexes
CREATE INDEX idx_manuscript_chunks_manuscript_id
  ON analyzer.manuscript_chunks(manuscript_id);
CREATE INDEX idx_manuscript_chunks_index
  ON analyzer.manuscript_chunks(manuscript_id, chunk_index);
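
Downstream analysis jobs read these rows back in chunk order. A minimal sketch, assuming the same schema-qualified table naming and the Supabase client export from @mystoryflow/database that the service below uses:

// Fetch a manuscript's chunks in processing order for AI analysis
import { getSupabaseBrowserClient } from '@mystoryflow/database'

export async function getChunksInOrder(manuscriptId: string) {
  const supabase = getSupabaseBrowserClient()
  const { data, error } = await supabase
    .from('analyzer.manuscript_chunks')
    .select('chunk_index, chunk_type, content, token_count, metadata')
    .eq('manuscript_id', manuscriptId)
    .order('chunk_index')

  if (error) throw error
  return data
}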

2. Text Chunking Service

// packages/manuscript-analysis/src/services/text-chunker.ts
import { encoding_for_model } from 'tiktoken'
import { getSupabaseBrowserClient } from '@mystoryflow/database'
import { trackAIUsage } from '@mystoryflow/analytics'

interface ChunkingOptions {
  maxTokens: number
  overlapTokens: number
  chunkingStrategy: 'chapter' | 'smart' | 'fixed'
  preserveChapters: boolean
}

interface TextChunk {
  index: number
  type: 'chapter' | 'section' | 'overlap'
  content: string
  tokenCount: number
  wordCount: number
  startPosition: number
  endPosition: number
  metadata: {
    chapterTitle?: string
    chapterNumber?: number
    hasCliffhanger?: boolean
  }
}

export class TextChunker {
  private tokenEncoder: any
  private supabase = getSupabaseBrowserClient()

  // Default options for optimal AI processing
  private defaultOptions: ChunkingOptions = {
    maxTokens: 3000,      // Safe limit for most AI models
    overlapTokens: 200,   // Context preservation
    chunkingStrategy: 'smart',
    preserveChapters: true
  }

  constructor() {
    this.tokenEncoder = encoding_for_model('gpt-4')
  }

  async chunkManuscript(
    manuscriptId: string,
    options: Partial<ChunkingOptions> = {}
  ): Promise<void> {
    const opts = { ...this.defaultOptions, ...options }
    const startTime = Date.now()

    // Get manuscript content and structure
    const { data: content } = await this.supabase
      .from('analyzer.manuscript_content')
      .select('raw_text')
      .eq('manuscript_id', manuscriptId)
      .single()

    const { data: chapters } = await this.supabase
      .from('analyzer.manuscript_structure')
      .select('*')
      .eq('manuscript_id', manuscriptId)
      .eq('type', 'chapter')
      .order('sequence_number')

    if (!content) throw new Error('Content not found')

    // Choose chunking strategy
    let chunks: TextChunk[]
    if (opts.chunkingStrategy === 'chapter' && chapters?.length > 0) {
      chunks = this.chunkByChapters(chapters, opts)
    } else if (opts.chunkingStrategy === 'smart') {
      chunks = this.smartChunk(content.raw_text, chapters, opts)
    } else {
      chunks = this.fixedChunk(content.raw_text, opts)
    }

    // Save chunks to database
    await this.saveChunks(manuscriptId, chunks, opts, Date.now() - startTime)
  }

  private chunkByChapters(
    chapters: any[],
    options: ChunkingOptions
  ): TextChunk[] {
    const chunks: TextChunk[] = []
    let chunkIndex = 0

    for (const chapter of chapters) {
      const chapterTokens = this.countTokens(chapter.content)

      if (chapterTokens <= options.maxTokens) {
        // Chapter fits in one chunk
        chunks.push({
          index: chunkIndex++,
          type: 'chapter',
          content: chapter.content,
          tokenCount: chapterTokens,
          wordCount: chapter.word_count,
          startPosition: chapter.start_position,
          endPosition: chapter.end_position,
          metadata: {
            chapterTitle: chapter.title,
            chapterNumber: chapter.sequence_number
          }
        })
      } else {
        // Split large chapter into multiple chunks
        const chapterChunks = this.splitLargeChapter(
          chapter,
          options,
          chunkIndex
        )
        chunks.push(...chapterChunks)
        chunkIndex += chapterChunks.length
      }
    }

    return chunks
  }

  private smartChunk(
    text: string,
    chapters: any[] | null,
    options: ChunkingOptions
  ): TextChunk[] {
    const chunks: TextChunk[] = []
    let currentPosition = 0
    let chunkIndex = 0

    // Split text into sentences for smart breaking
    const sentences = this.splitIntoSentences(text)
    let currentChunk = ''
    let currentTokens = 0
    let chunkStart = 0

    for (let i = 0; i < sentences.length; i++) {
      const sentence = sentences[i]
      const sentenceTokens = this.countTokens(sentence)

      if (currentTokens + sentenceTokens > options.maxTokens) {
        // Save current chunk
        if (currentChunk.trim()) {
          chunks.push({
            index: chunkIndex++,
            type: 'section',
            content: currentChunk.trim(),
            tokenCount: currentTokens,
            wordCount: this.countWords(currentChunk),
            startPosition: chunkStart,
            endPosition: currentPosition,
            metadata: this.detectChunkMetadata(currentChunk)
          })
        }

        // Start new chunk with overlap
        const overlapSentences = this.getOverlapSentences(
          sentences,
          i,
          options.overlapTokens
        )
        currentChunk = overlapSentences + sentence
        currentTokens = this.countTokens(currentChunk)
        chunkStart = currentPosition - overlapSentences.length
      } else {
        currentChunk += sentence
        currentTokens += sentenceTokens
      }

      currentPosition += sentence.length
    }

    // Save final chunk
    if (currentChunk.trim()) {
      chunks.push({
        index: chunkIndex,
        type: 'section',
        content: currentChunk.trim(),
        tokenCount: currentTokens,
        wordCount: this.countWords(currentChunk),
        startPosition: chunkStart,
        endPosition: text.length,
        metadata: this.detectChunkMetadata(currentChunk)
      })
    }

    return chunks
  }

  private fixedChunk(
    text: string,
    options: ChunkingOptions
  ): TextChunk[] {
    const chunks: TextChunk[] = []
    const words = text.split(/\s+/)
    const wordsPerChunk = Math.floor(options.maxTokens * 0.75) // Rough estimate

    for (let i = 0; i < words.length; i += wordsPerChunk) {
      const chunkWords = words.slice(i, i + wordsPerChunk)
      const content = chunkWords.join(' ')

      chunks.push({
        index: Math.floor(i / wordsPerChunk),
        type: 'section',
        content,
        tokenCount: this.countTokens(content),
        wordCount: chunkWords.length,
        startPosition: text.indexOf(chunkWords[0]),
        endPosition: text.indexOf(chunkWords[chunkWords.length - 1]) +
          chunkWords[chunkWords.length - 1].length,
        metadata: {}
      })
    }

    return chunks
  }

  private splitLargeChapter(
    chapter: any,
    options: ChunkingOptions,
    startIndex: number
  ): TextChunk[] {
    const chunks: TextChunk[] = []
    const sentences = this.splitIntoSentences(chapter.content)
    let currentChunk = ''
    let currentTokens = 0
    let chunkNumber = 0

    for (const sentence of sentences) {
      const sentenceTokens = this.countTokens(sentence)

      if (currentTokens + sentenceTokens > options.maxTokens) {
        chunks.push({
          index: startIndex + chunkNumber,
          type: 'chapter',
          content: currentChunk.trim(),
          tokenCount: currentTokens,
          wordCount: this.countWords(currentChunk),
          startPosition: chapter.start_position,
          endPosition: chapter.end_position,
          metadata: {
            chapterTitle: chapter.title,
            chapterNumber: chapter.sequence_number,
            chunkPart: chunkNumber + 1
          }
        })
        chunkNumber++
        currentChunk = sentence
        currentTokens = sentenceTokens
      } else {
        currentChunk += sentence
        currentTokens += sentenceTokens
      }
    }

    // Add final chunk
    if (currentChunk.trim()) {
      chunks.push({
        index: startIndex + chunkNumber,
        type: 'chapter',
        content: currentChunk.trim(),
        tokenCount: currentTokens,
        wordCount: this.countWords(currentChunk),
        startPosition: chapter.start_position,
        endPosition: chapter.end_position,
        metadata: {
          chapterTitle: chapter.title,
          chapterNumber: chapter.sequence_number,
          chunkPart: chunkNumber + 1
        }
      })
    }

    return chunks
  }

  private splitIntoSentences(text: string): string[] {
    // Simple sentence splitter - can be improved
    const sentences = text.match(/[^.!?]+[.!?]+/g) || []
    return sentences.map(s => s.trim() + ' ')
  }

  private getOverlapSentences(
    sentences: string[],
    currentIndex: number,
    overlapTokens: number
  ): string {
    let overlap = ''
    let tokens = 0

    // Go backwards to collect overlap sentences
    for (let i = currentIndex - 1; i >= 0 && tokens < overlapTokens; i--) {
      const sentence = sentences[i]
      const sentenceTokens = this.countTokens(sentence)

      if (tokens + sentenceTokens <= overlapTokens) {
        overlap = sentence + overlap
        tokens += sentenceTokens
      }
    }

    return overlap
  }

  private detectChunkMetadata(content: string): any {
    const metadata: any = {}

    // Detect if chunk ends with cliffhanger
    const lastSentence = content.trim().split(/[.!?]/).slice(-2, -1)[0] || ''
    if (lastSentence.match(/suddenly|but then|however|unexpectedly/i)) {
      metadata.hasCliffhanger = true
    }

    return metadata
  }

  private countTokens(text: string): number {
    try {
      return this.tokenEncoder.encode(text).length
    } catch {
      // Fallback to word count estimation
      return Math.ceil(text.split(/\s+/).length * 1.3)
    }
  }

  private countWords(text: string): number {
    return text.trim().split(/\s+/).filter(word => word.length > 0).length
  }

  private async saveChunks(
    manuscriptId: string,
    chunks: TextChunk[],
    options: ChunkingOptions,
    processingTime: number
  ): Promise<void> {
    // Save chunking session
    const { data: session } = await this.supabase
      .from('analyzer.chunking_sessions')
      .insert({
        manuscript_id: manuscriptId,
        total_chunks: chunks.length,
        average_chunk_size: Math.round(
          chunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0) / chunks.length
        ),
        chunking_strategy: options.chunkingStrategy,
        processing_time_ms: processingTime
      })
      .select()
      .single()

    // Save chunks
    await this.supabase
      .from('analyzer.manuscript_chunks')
      .insert(
        chunks.map(chunk => ({
          manuscript_id: manuscriptId,
          chunk_index: chunk.index,
          chunk_type: chunk.type,
          content: chunk.content,
          token_count: chunk.tokenCount,
          word_count: chunk.wordCount,
          start_position: chunk.startPosition,
          end_position: chunk.endPosition,
          metadata: chunk.metadata
        }))
      )

    // Update manuscript status
    await this.supabase
      .from('analyzer.manuscripts')
      .update({ status: 'chunked' })
      .eq('id', manuscriptId)
  }

  private async getManuscriptUserId(manuscriptId: string): Promise<string> {
    const { data } = await this.supabase
      .from('analyzer.manuscripts')
      .select('user_id')
      .eq('id', manuscriptId)
      .single()

    return data?.user_id
  }
}
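
For orientation, a background job might invoke the service like this (a minimal sketch; the function name and option values are illustrative, not part of the package):

// Hypothetical caller, e.g. inside a background job handler
async function runChunking(manuscriptId: string) {
  const chunker = new TextChunker()
  await chunker.chunkManuscript(manuscriptId, {
    // 'chapter' uses manuscript_structure rows; without them the service falls back to fixed chunks
    chunkingStrategy: 'chapter',
    maxTokens: 3000,
    overlapTokens: 200,
  })
}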

3. API Endpoint

// apps/analyzer-app/src/app/api/manuscripts/[id]/chunk/route.ts
import { NextRequest, NextResponse } from 'next/server'
import { TextChunker } from '@mystoryflow/manuscript-analysis'
import { withAuth } from '@mystoryflow/auth'

export async function POST(
  req: NextRequest,
  { params }: { params: { id: string } }
) {
  const session = await withAuth(req)
  if (!session) {
    return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
  }

  try {
    const { strategy = 'smart' } = await req.json()

    const chunker = new TextChunker()
    await chunker.chunkManuscript(params.id, {
      chunkingStrategy: strategy
    })

    return NextResponse.json({
      success: true,
      message: 'Manuscript chunked successfully'
    })
  } catch (error) {
    console.error('Chunking error:', error)
    return NextResponse.json(
      { error: 'Chunking failed' },
      { status: 500 }
    )
  }
}
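
From the analyzer app, the endpoint can be triggered with a plain fetch. A minimal sketch (the helper name is illustrative; the auth session is assumed to travel on the cookies that withAuth checks):

// Hypothetical client call to the chunking endpoint above
async function requestChunking(
  manuscriptId: string,
  strategy: 'chapter' | 'smart' | 'fixed' = 'smart'
) {
  const res = await fetch(`/api/manuscripts/${manuscriptId}/chunk`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ strategy }),
  })
  if (!res.ok) throw new Error(`Chunking failed: ${res.status}`)
  return res.json() // { success: true, message: 'Manuscript chunked successfully' }
}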

MVP Acceptance Criteria

  • Smart text segmentation within token limits
  • Chapter-aware chunking option
  • Context overlap between chunks
  • Token counting for GPT-4/Claude
  • Support for 150k word manuscripts
  • Efficient chunk storage
  • Processing progress tracking
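
For scale, the 150k-word target works out to roughly 195,000 tokens at the service's ~1.3 tokens-per-word estimate, or on the order of 65-70 chunks at the default 3,000-token limit once overlap is accounted for.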

Post-MVP Enhancements

  • Scene-based chunking
  • Dynamic chunk sizing based on content
  • Multi-language tokenization
  • Semantic chunking using embeddings
  • Chunk caching for repeated analysis
  • Parallel chunk processing
  • Custom chunking strategies per genre

Implementation Time

  • Development: 1.5 days
  • Testing: 0.5 days
  • Total: 2 days

Dependencies

  • F006-CONTENT-EXTRACTION (text content required)
  • F007-GENRE-DETECTION (for genre-specific strategies)

Next Feature

After completion, proceed to F009-ANALYSIS-FRAMEWORK, which is already documented.