F008 - Text Segmentation for AI Processing
Objective
Intelligently segment manuscripts into optimal chunks for AI analysis, ensuring context preservation while staying within token limits.
Quick Implementation
Using MyStoryFlow Components
- Background job processing from @mystoryflow/shared
- Database transaction handling via @mystoryflow/database
- Progress tracking utilities from @mystoryflow/ui
- Error handling patterns from @mystoryflow/shared
New Requirements
- Token counting for AI models (see the sketch after this list)
- Smart chunking algorithm
- Context overlap management
- Chapter-aware segmentation
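The token-counting requirement can be sanity-checked directly with the same tiktoken encoder the chunker below uses. A minimal standalone sketch (the sample string and the printed count are illustrative only):
import { encoding_for_model } from 'tiktoken'

// Count tokens the same way the chunker does, via the 'gpt-4' encoding
const encoder = encoding_for_model('gpt-4')
const sample = 'The quick brown fox jumps over the lazy dog.'
console.log(`Tokens: ${encoder.encode(sample).length}`) // roughly 10 tokens for this sentence
encoder.free() // release the WASM encoder when done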
MVP Implementation
1. Database Schema
-- Text chunks for AI processing (in analyzer schema)
CREATE TABLE analyzer.manuscript_chunks (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
manuscript_id UUID REFERENCES analyzer.manuscripts(id),
chunk_index INTEGER NOT NULL,
chunk_type VARCHAR(50) NOT NULL, -- 'chapter', 'section', 'overlap'
content TEXT NOT NULL,
token_count INTEGER NOT NULL,
word_count INTEGER NOT NULL,
start_position INTEGER NOT NULL,
end_position INTEGER NOT NULL,
metadata JSONB DEFAULT '{}',
created_at TIMESTAMP DEFAULT NOW()
);
-- Chunking sessions
CREATE TABLE analyzer.chunking_sessions (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
manuscript_id UUID REFERENCES analyzer.manuscripts(id),
total_chunks INTEGER,
average_chunk_size INTEGER,
chunking_strategy VARCHAR(50),
processing_time_ms INTEGER,
created_at TIMESTAMP DEFAULT NOW()
);
-- Indexes
CREATE INDEX idx_manuscript_chunks_manuscript_id ON analyzer.manuscript_chunks(manuscript_id);
CREATE INDEX idx_manuscript_chunks_index ON analyzer.manuscript_chunks(manuscript_id, chunk_index);
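Downstream analysis is expected to read chunks back in chunk_index order. A minimal retrieval sketch using the same database client as the chunker (the helper name is illustrative, not an existing API):
import { getSupabaseBrowserClient } from '@mystoryflow/database'

// Fetch a manuscript's chunks in processing order (sketch)
export async function getManuscriptChunks(manuscriptId: string) {
  const supabase = getSupabaseBrowserClient()
  const { data, error } = await supabase
    .from('analyzer.manuscript_chunks')
    .select('chunk_index, chunk_type, content, token_count, metadata')
    .eq('manuscript_id', manuscriptId)
    .order('chunk_index')
  if (error) throw error
  return data
}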
2. Text Chunking Service
// packages/manuscript-analysis/src/services/text-chunker.ts
import { encoding_for_model } from 'tiktoken'
import { getSupabaseBrowserClient } from '@mystoryflow/database'
import { trackAIUsage } from '@mystoryflow/analytics'
interface ChunkingOptions {
maxTokens: number
overlapTokens: number
chunkingStrategy: 'chapter' | 'smart' | 'fixed'
preserveChapters: boolean
}
interface TextChunk {
index: number
type: 'chapter' | 'section' | 'overlap'
content: string
tokenCount: number
wordCount: number
startPosition: number
endPosition: number
metadata: {
chapterTitle?: string
chapterNumber?: number
hasCliffhanger?: boolean
}
}
export class TextChunker {
private tokenEncoder: any
private supabase = getSupabaseBrowserClient()
// Default options for optimal AI processing
private defaultOptions: ChunkingOptions = {
maxTokens: 3000, // Safe limit for most AI models
overlapTokens: 200, // Context preservation
chunkingStrategy: 'smart',
preserveChapters: true
}
constructor() {
this.tokenEncoder = encoding_for_model('gpt-4')
}
async chunkManuscript(
manuscriptId: string,
options: Partial<ChunkingOptions> = {}
): Promise<void> {
const opts = { ...this.defaultOptions, ...options }
const startTime = Date.now()
// Get manuscript content and structure
const { data: content } = await this.supabase
.from('analyzer.manuscript_content')
.select('raw_text')
.eq('manuscript_id', manuscriptId)
.single()
const { data: chapters } = await this.supabase
.from('analyzer.manuscript_structure')
.select('*')
.eq('manuscript_id', manuscriptId)
.eq('type', 'chapter')
.order('sequence_number')
if (!content) throw new Error('Content not found')
// Choose chunking strategy
let chunks: TextChunk[]
if (opts.chunkingStrategy === 'chapter' && chapters && chapters.length > 0) {
chunks = this.chunkByChapters(chapters, opts)
} else if (opts.chunkingStrategy === 'smart') {
chunks = this.smartChunk(content.raw_text, chapters, opts)
} else {
chunks = this.fixedChunk(content.raw_text, opts)
}
// Save chunks to database
await this.saveChunks(manuscriptId, chunks, opts, Date.now() - startTime)
}
private chunkByChapters(
chapters: any[],
options: ChunkingOptions
): TextChunk[] {
const chunks: TextChunk[] = []
let chunkIndex = 0
for (const chapter of chapters) {
const chapterTokens = this.countTokens(chapter.content)
if (chapterTokens <= options.maxTokens) {
// Chapter fits in one chunk
chunks.push({
index: chunkIndex++,
type: 'chapter',
content: chapter.content,
tokenCount: chapterTokens,
wordCount: chapter.word_count,
startPosition: chapter.start_position,
endPosition: chapter.end_position,
metadata: {
chapterTitle: chapter.title,
chapterNumber: chapter.sequence_number
}
})
} else {
// Split large chapter into multiple chunks
const chapterChunks = this.splitLargeChapter(
chapter,
options,
chunkIndex
)
chunks.push(...chapterChunks)
chunkIndex += chapterChunks.length
}
}
return chunks
}
private smartChunk(
text: string,
chapters: any[] | null,
options: ChunkingOptions
): TextChunk[] {
const chunks: TextChunk[] = []
let currentPosition = 0
let chunkIndex = 0
// Split text into sentences for smart breaking
const sentences = this.splitIntoSentences(text)
let currentChunk = ''
let currentTokens = 0
let chunkStart = 0
for (let i = 0; i < sentences.length; i++) {
const sentence = sentences[i]
const sentenceTokens = this.countTokens(sentence)
if (currentTokens + sentenceTokens > options.maxTokens) {
// Save current chunk
if (currentChunk.trim()) {
chunks.push({
index: chunkIndex++,
type: 'section',
content: currentChunk.trim(),
tokenCount: currentTokens,
wordCount: this.countWords(currentChunk),
startPosition: chunkStart,
endPosition: currentPosition,
metadata: this.detectChunkMetadata(currentChunk)
})
}
// Start new chunk with overlap
const overlapSentences = this.getOverlapSentences(
sentences,
i,
options.overlapTokens
)
currentChunk = overlapSentences + sentence
currentTokens = this.countTokens(currentChunk)
chunkStart = currentPosition - overlapSentences.length
} else {
currentChunk += sentence
currentTokens += sentenceTokens
}
currentPosition += sentence.length
}
// Save final chunk
if (currentChunk.trim()) {
chunks.push({
index: chunkIndex,
type: 'section',
content: currentChunk.trim(),
tokenCount: currentTokens,
wordCount: this.countWords(currentChunk),
startPosition: chunkStart,
endPosition: text.length,
metadata: this.detectChunkMetadata(currentChunk)
})
}
return chunks
}
private fixedChunk(
  text: string,
  options: ChunkingOptions
): TextChunk[] {
  const chunks: TextChunk[] = []
  const words = text.split(/\s+/).filter(word => word.length > 0)
  const wordsPerChunk = Math.floor(options.maxTokens * 0.75) // Rough words-per-token estimate
  let searchFrom = 0
  for (let i = 0; i < words.length; i += wordsPerChunk) {
    const chunkWords = words.slice(i, i + wordsPerChunk)
    const content = chunkWords.join(' ')
    // Forward-moving search keeps the approximate character offsets monotonic,
    // so repeated words don't map back to earlier occurrences in the text
    const startPosition = Math.max(text.indexOf(chunkWords[0], searchFrom), searchFrom)
    const lastWord = chunkWords[chunkWords.length - 1]
    const endPosition = text.indexOf(lastWord, startPosition) + lastWord.length
    searchFrom = endPosition
    chunks.push({
      index: Math.floor(i / wordsPerChunk),
      type: 'section',
      content,
      tokenCount: this.countTokens(content),
      wordCount: chunkWords.length,
      startPosition,
      endPosition,
      metadata: {}
    })
  }
  return chunks
}
private splitLargeChapter(
chapter: any,
options: ChunkingOptions,
startIndex: number
): TextChunk[] {
const chunks: TextChunk[] = []
const sentences = this.splitIntoSentences(chapter.content)
let currentChunk = ''
let currentTokens = 0
let chunkNumber = 0
for (const sentence of sentences) {
const sentenceTokens = this.countTokens(sentence)
if (currentTokens + sentenceTokens > options.maxTokens) {
chunks.push({
index: startIndex + chunkNumber,
type: 'chapter',
content: currentChunk.trim(),
tokenCount: currentTokens,
wordCount: this.countWords(currentChunk),
startPosition: chapter.start_position,
endPosition: chapter.end_position,
metadata: {
chapterTitle: chapter.title,
chapterNumber: chapter.sequence_number,
chunkPart: chunkNumber + 1
}
})
chunkNumber++
currentChunk = sentence
currentTokens = sentenceTokens
} else {
currentChunk += sentence
currentTokens += sentenceTokens
}
}
// Add final chunk
if (currentChunk.trim()) {
chunks.push({
index: startIndex + chunkNumber,
type: 'chapter',
content: currentChunk.trim(),
tokenCount: currentTokens,
wordCount: this.countWords(currentChunk),
startPosition: chapter.start_position,
endPosition: chapter.end_position,
metadata: {
chapterTitle: chapter.title,
chapterNumber: chapter.sequence_number,
chunkPart: chunkNumber + 1
}
})
}
return chunks
}
private splitIntoSentences(text: string): string[] {
  // Simple sentence splitter - can be improved
  // The second alternative keeps trailing text that lacks terminal punctuation
  const sentences = text.match(/[^.!?]+[.!?]+|[^.!?]+$/g) || []
  return sentences
    .filter(s => s.trim().length > 0)
    .map(s => s.trim() + ' ')
}
private getOverlapSentences(
sentences: string[],
currentIndex: number,
overlapTokens: number
): string {
let overlap = ''
let tokens = 0
// Go backwards to collect overlap sentences
for (let i = currentIndex - 1; i >= 0 && tokens < overlapTokens; i--) {
const sentence = sentences[i]
const sentenceTokens = this.countTokens(sentence)
if (tokens + sentenceTokens <= overlapTokens) {
overlap = sentence + overlap
tokens += sentenceTokens
}
}
return overlap
}
private detectChunkMetadata(content: string): any {
const metadata: any = {}
// Detect if chunk ends with cliffhanger
const lastSentence = content.trim().split(/[.!?]/).slice(-2, -1)[0] || ''
if (lastSentence.match(/suddenly|but then|however|unexpectedly/i)) {
metadata.hasCliffhanger = true
}
return metadata
}
private countTokens(text: string): number {
try {
return this.tokenEncoder.encode(text).length
} catch {
// Fallback to word count estimation
return Math.ceil(text.split(/\s+/).length * 1.3)
}
}
private countWords(text: string): number {
return text.trim().split(/\s+/).filter(word => word.length > 0).length
}
private async saveChunks(
manuscriptId: string,
chunks: TextChunk[],
options: ChunkingOptions,
processingTime: number
): Promise<void> {
// Save chunking session
const { error: sessionError } = await this.supabase
  .from('analyzer.chunking_sessions')
  .insert({
    manuscript_id: manuscriptId,
    total_chunks: chunks.length,
    average_chunk_size: Math.round(
      chunks.reduce((sum, chunk) => sum + chunk.tokenCount, 0) / chunks.length
    ),
    chunking_strategy: options.chunkingStrategy,
    processing_time_ms: processingTime
  })
if (sessionError) throw sessionError
// Save chunks
const { error: chunksError } = await this.supabase
  .from('analyzer.manuscript_chunks')
  .insert(
    chunks.map(chunk => ({
      manuscript_id: manuscriptId,
      chunk_index: chunk.index,
      chunk_type: chunk.type,
      content: chunk.content,
      token_count: chunk.tokenCount,
      word_count: chunk.wordCount,
      start_position: chunk.startPosition,
      end_position: chunk.endPosition,
      metadata: chunk.metadata
    }))
  )
if (chunksError) throw chunksError
// Update manuscript status
await this.supabase
.from('analyzer.manuscripts')
.update({ status: 'chunked' })
.eq('id', manuscriptId)
}
private async getManuscriptUserId(manuscriptId: string): Promise<string> {
const { data } = await this.supabase
.from('analyzer.manuscripts')
.select('user_id')
.eq('id', manuscriptId)
.single()
if (!data) throw new Error(`Manuscript ${manuscriptId} not found`)
return data.user_id
}
}
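The API route below calls the chunker with only a strategy, but chunkManuscript also accepts custom limits. A small usage sketch with tuned options (the values and helper name are illustrative):
import { TextChunker } from '@mystoryflow/manuscript-analysis'

// Chunk a manuscript chapter-by-chapter with a wider overlap window (sketch)
async function chunkWithChapterStrategy(manuscriptId: string) {
  const chunker = new TextChunker()
  await chunker.chunkManuscript(manuscriptId, {
    chunkingStrategy: 'chapter',
    maxTokens: 3500,   // stay well under the model's context window
    overlapTokens: 300 // carry more context across chunk boundaries
  })
}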
3. API Endpoint
// apps/analyzer-app/src/app/api/manuscripts/[id]/chunk/route.ts
import { NextRequest, NextResponse } from 'next/server'
import { TextChunker } from '@mystoryflow/manuscript-analysis'
import { withAuth } from '@mystoryflow/auth'
export async function POST(
req: NextRequest,
{ params }: { params: { id: string } }
) {
const session = await withAuth(req)
if (!session) {
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
}
try {
const { strategy = 'smart' } = await req.json().catch(() => ({})) // default to smart chunking when no body is sent
const chunker = new TextChunker()
await chunker.chunkManuscript(params.id, {
chunkingStrategy: strategy
})
return NextResponse.json({
success: true,
message: 'Manuscript chunked successfully'
})
} catch (error) {
console.error('Chunking error:', error)
return NextResponse.json(
{ error: 'Chunking failed' },
{ status: 500 }
)
}
}
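A minimal client-side call against this endpoint, assuming the user is already authenticated in the analyzer app (sketch; the helper name is illustrative):
// Trigger chunking for a manuscript from the client (sketch)
async function requestChunking(
  manuscriptId: string,
  strategy: 'chapter' | 'smart' | 'fixed' = 'smart'
) {
  const res = await fetch(`/api/manuscripts/${manuscriptId}/chunk`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ strategy })
  })
  if (!res.ok) throw new Error(`Chunking request failed: ${res.status}`)
  return res.json()
}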
MVP Acceptance Criteria
- Smart text segmentation within token limits
- Chapter-aware chunking option
- Context overlap between chunks
- Token counting for GPT-4/Claude
- Support for 150k word manuscripts (see the estimate after this list)
- Efficient chunk storage
- Processing progress tracking
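As a rough capacity check for the 150k word criterion, using the same ~1.3 tokens-per-word fallback as countTokens, a full-length manuscript works out to roughly 65 chunks at the default 3,000 token limit, before overlap is added:
// Back-of-envelope chunk count for a 150k word manuscript (estimate only)
const words = 150_000
const estimatedTokens = Math.ceil(words * 1.3)            // ≈ 195,000 tokens
const estimatedChunks = Math.ceil(estimatedTokens / 3000) // ≈ 65 chunks before overlap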
Post-MVP Enhancements
- Scene-based chunking
- Dynamic chunk sizing based on content
- Multi-language tokenization
- Semantic chunking using embeddings
- Chunk caching for repeated analysis
- Parallel chunk processing
- Custom chunking strategies per genre
Implementation Time
- Development: 1.5 days
- Testing: 0.5 days
- Total: 2 days
Dependencies
- F006-CONTENT-EXTRACTION (text content required)
- F007-GENRE-DETECTION (for genre-specific strategies)
Next Feature
After completion, proceed to F009-ANALYSIS-FRAMEWORK, which is already documented.