F008 - Text Segmentation for AI Processing
Objective
Intelligently segment manuscripts into optimal chunks for AI analysis, ensuring context preservation while staying within token limits.
Quick Implementation
Using MyStoryFlow Components
- Background job processing from @mystoryflow/shared
- Database transaction handling via @mystoryflow/database
- Progress tracking utilities from @mystoryflow/ui
- Error handling patterns from @mystoryflow/shared
New Requirements
- Token counting for AI models (see the sketch after this list)
- Smart chunking algorithm
- Context overlap management
- Chapter-aware segmentation
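The token-counting requirement can be sanity-checked directly with the same tiktoken encoder the chunker below uses. A minimal standalone sketch (the sample string and the printed count are illustrative only):
import { encoding_for_model } from 'tiktoken'

// Count tokens the same way the chunker does, via the 'gpt-4' encoding
const encoder = encoding_for_model('gpt-4')
const sample = 'The quick brown fox jumps over the lazy dog.'
console.log(`Tokens: ${encoder.encode(sample).length}`) // roughly 10 tokens for this sentence
encoder.free() // release the WASM encoder when done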
MVP Implementation
1. Database Schema
-- Text chunks for AI processing (in analyzer schema)
CREATE TABLE analyzer.manuscript_chunks (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
manuscript_id UUID REFERENCES analyzer.manuscripts(id),
chunk_index INTEGER NOT NULL,
chunk_type VARCHAR(50) NOT NULL, -- 'chapter', 'section', 'overlap'
content TEXT NOT NULL,
token_count INTEGER NOT NULL,
word_count INTEGER NOT NULL,
start_position INTEGER NOT NULL,
end_position INTEGER NOT NULL,
metadata JSONB DEFAULT '{}',
created_at TIMESTAMP DEFAULT NOW()
);
-- Chunking sessions
CREATE TABLE analyzer.chunking_sessions (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
manuscript_id UUID REFERENCES analyzer.manuscripts(id),
total_chunks INTEGER,
average_chunk_size INTEGER,
chunking_strategy VARCHAR(50),
processing_time_ms INTEGER,
created_at TIMESTAMP DEFAULT NOW()
);
-- Indexes
CREATE INDEX idx_manuscript_chunks_manuscript_id ON analyzer.manuscript_chunks(manuscript_id);
CREATE INDEX idx_manuscript_chunks_index ON analyzer.manuscript_chunks(manuscript_id, chunk_index);
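Downstream analysis is expected to read chunks back in chunk_index order. A minimal retrieval sketch using the same database client as the chunker (the helper name is illustrative, not an existing API):
import { getSupabaseBrowserClient } from '@mystoryflow/database'

// Fetch a manuscript's chunks in processing order (sketch)
export async function getManuscriptChunks(manuscriptId: string) {
  const supabase = getSupabaseBrowserClient()
  const { data, error } = await supabase
    .from('analyzer.manuscript_chunks')
    .select('chunk_index, chunk_type, content, token_count, metadata')
    .eq('manuscript_id', manuscriptId)
    .order('chunk_index')
  if (error) throw error
  return data
}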
2. Text Chunking Service
// packages/manuscript-analysis/src/services/text-chunker.ts
import { encoding_for_model } from 'tiktoken'
import { getSupabaseBrowserClient } from '@mystoryflow/database'
import { trackAIUsage } from '@mystoryflow/analytics'
interface ChunkingOptions {
maxTokens: number
overlapTokens: number
chunkingStrategy: 'chapter' | 'smart' | 'fixed'
preserveChapters: boolean
}
interface TextChunk {
index: number
type: 'chapter' | 'section' | 'overlap'
content: string
tokenCount: number
wordCount: number
startPosition: number
endPosition: number
metadata: {
chapterTitle?: string
chapterNumber?: number
hasCliffhanger?: boolean
}
}
export class TextChunker {
private tokenEncoder: any
private supabase = getSupabaseBrowserClient()
// Default options for optimal AI processing
private defaultOptions: ChunkingOptions = {
maxTokens: 3000, // Safe limit for most AI models
overlapTokens: 200, // Context preservation
chunkingStrategy: 'smart',
preserveChapters: true
}
constructor() {
this.tokenEncoder = encoding_for_model('gpt-4')
}
async chunkManuscript(
manuscriptId: string,
options: Partial<ChunkingOptions> = {}
): Promise<void> {
const opts = { ...this.defaultOptions, ...options }
const startTime = Date.now()
// Get manuscript content and structure
const { data: content } = await this.supabase
.from('analyzer.manuscript_content')
.select('raw_text')
.eq('manuscript_id', manuscriptId)
.single()
const { data: chapters } = await this.supabase
.from('analyzer.manuscript_structure')
.select('*')
.eq('manuscript_id', manuscriptId)
.eq('type', 'chapter')
.order('sequence_number')
if (!content) throw new Error('Content not found')
// Choose chunking strategy
let chunks: TextChunk[]
if (opts.chunkingStrategy === 'chapter' && chapters && chapters.length > 0) {
chunks = this.chunkByChapters(chapters, opts)
} else if (opts.chunkingStrategy === 'smart') {
chunks = this.smartChunk(content.raw_text, chapters, opts)
} else {
chunks = this.fixedChunk(content.raw_text, opts)
}
// Save chunks to database
await this.saveChunks(manuscriptId, chunks, opts, Date.now() - startTime)
}
private chunkByChapters(
chapters: any[],
options: ChunkingOptions
): TextChunk[] {
const chunks: TextChunk[] = []
let chunkIndex = 0
for (const chapter of chapters) {
const chapterTokens = this.countTokens(chapter.content)
if (chapterTokens <= options.maxTokens) {
// Chapter fits in one chunk
chunks.push({
index: chunkIndex++,
type: 'chapter',
content: chapter.content,
tokenCount: chapterTokens,
wordCount: chapter.word_count,
startPosition: chapter.start_position,
endPosition: chapter.end_position,
metadata: {
chapterTitle: chapter.title,
chapterNumber: chapter.sequence_number
}
})
} else {
// Split large chapter into multiple chunks
const chapterChunks = this.splitLargeChapter(
chapter,
options,
chunkIndex
)
chunks.push(...chapterChunks)
chunkIndex += chapterChunks.length
}
}
return chunks
}
private smartChunk(
text: string,
chapters: any[] | null,
options: ChunkingOptions
): TextChunk[] {
const chunks: TextChunk[] = []
let currentPosition = 0
let chunkIndex = 0
// Split text into sentences for smart breaking
const sentences = this.splitIntoSentences(text)
let currentChunk = ''
let currentTokens = 0
let chunkStart = 0
for (let i = 0; i < sentences.length; i++) {
const sentence = sentences[i]
const sentenceTokens = this.countTokens(sentence)
if (currentTokens + sentenceTokens > options.maxTokens) {
// Save current chunk
if (currentChunk.trim()) {
chunks.push({
index: chunkIndex++,
type: 'section',
content: currentChunk.trim(),
tokenCount: currentTokens,
wordCount: this.countWords(currentChunk),
startPosition: chunkStart,
endPosition: currentPosition,
metadata: this.detectChunkMetadata(currentChunk)
})
}
// Start new chunk with overlap
const overlapSentences = this.getOverlapSentences(
sentences,
i,
options.overlapTokens
)
currentChunk = overlapSentences + sentence
currentTokens = this.countTokens(currentChunk)
chunkStart = currentPosition - overlapSentences.length
} else {
currentChunk += sentence
currentTokens += sentenceTokens
}
currentPosition += sentence.length
}
// Save final chunk
if (currentChunk.trim()) {
chunks.push({
index: chunkIndex,
type: 'section',
content: currentChunk.trim(),
tokenCount: currentTokens,
wordCount: this.countWords(currentChunk),
startPosition: chunkStart,
endPosition: text.length,
metadata: this.detectChunkMetadata(currentChunk)
})
}
return chunks
}
private fixedChunk(
  text: string,
  options: ChunkingOptions
): TextChunk[] {
  const chunks: TextChunk[] = []
  const words = text.split(/\s+/).filter(word => word.length > 0)
  const wordsPerChunk = Math.floor(options.maxTokens * 0.75) // Rough words-per-token estimate
  let searchFrom = 0
  for (let i = 0; i < words.length; i += wordsPerChunk) {
    const chunkWords = words.slice(i, i + wordsPerChunk)
    const content = chunkWords.join(' ')
    // Forward-moving search keeps the approximate character offsets monotonic,
    // so repeated words don't map back to earlier occurrences in the text
    const startPosition = Math.max(text.indexOf(chunkWords[0], searchFrom), searchFrom)
    const lastWord = chunkWords[chunkWords.length - 1]
    const endPosition = text.indexOf(lastWord, startPosition) + lastWord.length
    searchFrom = endPosition
    chunks.push({
      index: Math.floor(i / wordsPerChunk),
      type: 'section',
      content,
      tokenCount: this.countTokens(content),
      wordCount: chunkWords.length,
      startPosition,
      endPosition,
      metadata: {}
    })
  }
  return chunks
}
private splitLargeChapter(
chapter: any,
options: ChunkingOptions,
startIndex: number
): TextChunk[] {
const chunks: TextChunk[] = []
const sentences = this.splitIntoSentences(chapter.content)
let currentChunk = ''
let currentTokens = 0
let chunkNumber = 0
for (const sentence of sentences) {
const sentenceTokens = this.countTokens(sentence)
if (currentTokens + sentenceTokens > options.maxTokens) {
chunks.push({
index: startIndex + chunkNumber,
type: 'chapter',
content: currentChunk.trim(),
tokenCount: currentTokens,
wordCount: this.countWords(currentChunk),
startPosition: chapter.start_position,
endPosition: chapter.end_position,
metadata: {
chapterTitle: chapter.title,
chapterNumber: chapter.sequence_number,
chunkPart: chunkNumber + 1
}
})
chunkNumber++
currentChunk = sentence
currentTokens = sentenceTokens
} else {
currentChunk += sentence
currentTokens += sentenceTokens
}
}
// Add final chunk
if (currentChunk.trim()) {
chunks.push({
index: startIndex + chunkNumber,
type: 'chapter',
content: currentChunk.trim(),
tokenCount: currentTokens,
wordCount: this.countWords(currentChunk),
startPosition: chapter.start_position,
endPosition: chapter.end_position,
metadata: {
chapterTitle: chapter.title,
chapterNumber: chapter.sequence_number,
chunkPart: chunkNumber + 1
}
})
}
return chunks
}
private splitIntoSentences(text: string): string[] {
  // Simple sentence splitter - can be improved
  // The second alternative keeps trailing text that lacks terminal punctuation
  const sentences = text.match(/[^.!?]+[.!?]+|[^.!?]+$/g) || []
  return sentences
    .filter(s => s.trim().length > 0)
    .map(s => s.trim() + ' ')
}
private getOverlapSentences(
sentences: string[],
currentIndex: number,
overlapTokens: number
): string {
let overlap = ''
let tokens = 0
// Go backwards to collect overlap sentences
for (let i = currentIndex - 1; i >= 0 && tokens < overlapTokens; i--) {
const sentence = sentences[i]
const sentenceTokens = this.countTokens(sentence)
if (tokens + sentenceTokens <= overlapTokens) {
overlap = sentence + overlap
tokens += sentenceTokens
}
}
return overlap
}
private detectChunkMetadata(content: string): any {
const metadata: any = {}
// Detect if chunk ends with cliffhanger
const lastSentence = content.trim().split(/[.!?]/).slice(-2, -1)[0] || ''
if (lastSentence.match(/suddenly|but then|however|unexpectedly/i)) {
metadata.hasCliffhanger = true
}
return metadata
}
private countTokens(text: string): number {
try {
return this.tokenEncoder.encode(text).length
} catch {
// Fallback to word count estimation
return Math.ceil(text.split(/\s+/).length * 1.3)
}
}
private countWords(text: string): number {
return text.trim().split(/\s+/).filter(word => word.length > 0).length
}
private async saveChunks(
manuscriptId: string,
chunks: TextChunk[],
options: ChunkingOptions,
processingTime: number
): Promise<void> {
// Save chunking session
const { error: sessionError } = await this.supabase
  .from('analyzer.chunking_sessions')
  .insert({
    manuscript_id: manuscriptId,
    total_chunks: chunks.length,
    average_chunk_size: Math.round(
      chunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0) / chunks.length
    ),
    chunking_strategy: options.chunkingStrategy,
    processing_time_ms: processingTime
  })
if (sessionError) throw sessionError
// Save chunks
const { error: chunksError } = await this.supabase
  .from('analyzer.manuscript_chunks')
  .insert(
    chunks.map(chunk => ({
      manuscript_id: manuscriptId,
      chunk_index: chunk.index,
      chunk_type: chunk.type,
      content: chunk.content,
      token_count: chunk.tokenCount,
      word_count: chunk.wordCount,
      start_position: chunk.startPosition,
      end_position: chunk.endPosition,
      metadata: chunk.metadata
    }))
  )
if (chunksError) throw chunksError
// Update manuscript status
await this.supabase
.from('analyzer.manuscripts')
.update({ status: 'chunked' })
.eq('id', manuscriptId)
}
private async getManuscriptUserId(manuscriptId: string): Promise<string> {
const { data } = await this.supabase
.from('analyzer.manuscripts')
.select('user_id')
.eq('id', manuscriptId)
.single()
if (!data) throw new Error(`Manuscript ${manuscriptId} not found`)
return data.user_id
}
}
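The API route below calls the chunker with only a strategy, but chunkManuscript also accepts custom limits. A small usage sketch with tuned options (the values and helper name are illustrative):
import { TextChunker } from '@mystoryflow/manuscript-analysis'

// Chunk a manuscript chapter-by-chapter with a wider overlap window (sketch)
async function chunkWithChapterStrategy(manuscriptId: string) {
  const chunker = new TextChunker()
  await chunker.chunkManuscript(manuscriptId, {
    chunkingStrategy: 'chapter',
    maxTokens: 3500,   // stay well under the model's context window
    overlapTokens: 300 // carry more context across chunk boundaries
  })
}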
3. API Endpoint
// apps/analyzer-app/src/app/api/manuscripts/[id]/chunk/route.ts
import { NextRequest, NextResponse } from 'next/server'
import { TextChunker } from '@mystoryflow/manuscript-analysis'
import { withAuth } from '@mystoryflow/auth'
export async function POST(
req: NextRequest,
{ params }: { params: { id: string } }
) {
const session = await withAuth(req)
if (!session) {
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
}
try {
const { strategy = 'smart' } = await req.json().catch(() => ({})) // default to smart chunking when no body is sent
const chunker = new TextChunker()
await chunker.chunkManuscript(params.id, {
chunkingStrategy: strategy
})
return NextResponse.json({
success: true,
message: 'Manuscript chunked successfully'
})
} catch (error) {
console.error('Chunking error:', error)
return NextResponse.json(
{ error: 'Chunking failed' },
{ status: 500 }
)
}
}
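A minimal client-side call against this endpoint, assuming the user is already authenticated in the analyzer app (sketch; the helper name is illustrative):
// Trigger chunking for a manuscript from the client (sketch)
async function requestChunking(
  manuscriptId: string,
  strategy: 'chapter' | 'smart' | 'fixed' = 'smart'
) {
  const res = await fetch(`/api/manuscripts/${manuscriptId}/chunk`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ strategy })
  })
  if (!res.ok) throw new Error(`Chunking request failed: ${res.status}`)
  return res.json()
}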
MVP Acceptance Criteria
- Smart text segmentation within token limits
- Chapter-aware chunking option
- Context overlap between chunks
- Token counting for GPT-4/Claude
- Support for 150k word manuscripts (see the estimate after this list)
- Efficient chunk storage
- Processing progress tracking
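As a rough capacity check for the 150k word criterion, using the same ~1.3 tokens-per-word fallback as countTokens, a full-length manuscript works out to roughly 65 chunks at the default 3,000 token limit, before overlap is added:
// Back-of-envelope chunk count for a 150k word manuscript (estimate only)
const words = 150_000
const estimatedTokens = Math.ceil(words * 1.3)            // ≈ 195,000 tokens
const estimatedChunks = Math.ceil(estimatedTokens / 3000) // ≈ 65 chunks before overlap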
Post-MVP Enhancements
- Scene-based chunking
- Dynamic chunk sizing based on content
- Multi-language tokenization
- Semantic chunking using embeddings
- Chunk caching for repeated analysis
- Parallel chunk processing
- Custom chunking strategies per genre
Implementation Time
- Development: 1.5 days
- Testing: 0.5 days
- Total: 2 days
Dependencies
- F006-CONTENT-EXTRACTION (text content required)
- F007-GENRE-DETECTION (for genre-specific strategies)
Next Feature
After completion, proceed to F009-ANALYSIS-FRAMEWORK, which is already documented.