Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

274 feature azure document intelligence as a segmentation model #302

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion apps/web/.env.example
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
VITE_API_URL=
# VITE_API_KEY=

VITE_KEYCLOAK_URL=
VITE_KEYCLOAK_REALM=
VITE_KEYCLOAK_CLIENT_ID=
VITE_KEYCLOAK_REDIRECT_URI=http://localhost:5173
VITE_KEYCLOAK_POST_LOGOUT_REDIRECT_URI=http://localhost:5173
VITE_KEYCLOAK_POST_LOGOUT_REDIRECT_URI=http://localhost:5173
21 changes: 11 additions & 10 deletions apps/web/src/components/Upload/ConfigControls.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -171,21 +171,23 @@ export function SegmentProcessingControls({
onChange,
showOnlyPage = false,
}: SegmentProcessingControlsProps) {
const [selectedType, setSelectedType] =
useState<keyof SegmentProcessing>("Text");
const [isDropdownOpen, setIsDropdownOpen] = useState(false);
const segmentTypes = showOnlyPage
? (["Page"] as (keyof SegmentProcessing)[])
: (Object.keys(value).filter(
(key) => key !== "Page"
) as (keyof SegmentProcessing)[]);
: (Object.keys(value)
.filter((key) => key !== "Page")
.sort() as (keyof SegmentProcessing)[]);

const defaultSegmentType = segmentTypes[0];
const [selectedType, setSelectedType] =
useState<keyof SegmentProcessing>(defaultSegmentType);
const [isDropdownOpen, setIsDropdownOpen] = useState(false);
const dropdownRef = useRef<HTMLDivElement>(null);

useEffect(() => {
if (showOnlyPage && selectedType !== "Page") {
setSelectedType("Page");
} else if (!showOnlyPage && selectedType === "Page") {
setSelectedType("Text"); // or any other default segment type
setSelectedType(defaultSegmentType);
}
}, [selectedType, showOnlyPage]);

Expand Down Expand Up @@ -270,9 +272,8 @@ export function SegmentProcessingControls({
{segmentTypes.map((type) => (
<button
key={type}
className={`segment-dropdown-item ${
selectedType === type ? "active" : ""
} ${isSegmentModified(type) ? "modified" : ""}`}
className={`segment-dropdown-item ${selectedType === type ? "active" : ""
} ${isSegmentModified(type) ? "modified" : ""}`}
onClick={() => handleTypeSelect(type)}
type="button"
>
Expand Down
42 changes: 42 additions & 0 deletions apps/web/src/components/Upload/UploadMain.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import {
SegmentationStrategy,
DEFAULT_UPLOAD_CONFIG,
DEFAULT_SEGMENT_PROCESSING,
Pipeline,
} from "../../models/taskConfig.model";
import "./UploadMain.css";
import Upload from "./Upload";
Expand All @@ -17,6 +18,7 @@ import {
} from "./ConfigControls";
import { uploadFile } from "../../services/uploadFileApi";
import { UploadForm } from "../../models/upload.model";
import { getEnvConfig, WhenEnabled } from "../../config/env.config";

interface UploadMainProps {
onSubmit: (config: UploadFormData) => void;
Expand All @@ -28,6 +30,7 @@ export default function UploadMain({
isAuthenticated,
onUploadSuccess,
}: UploadMainProps) {
const { features } = getEnvConfig();
const [files, setFiles] = useState<File[]>([]);
const [config, setConfig] = useState<UploadFormData>(DEFAULT_UPLOAD_CONFIG);
const [isUploading, setIsUploading] = useState(false);
Expand Down Expand Up @@ -71,6 +74,7 @@ export default function UploadMain({
ocr_strategy: config.ocr_strategy,
segment_processing: getEffectiveSegmentProcessing(config),
segmentation_strategy: config.segmentation_strategy,
pipeline: config.pipeline,
};

const response = await uploadFile(uploadPayload);
Expand Down Expand Up @@ -111,6 +115,44 @@ export default function UploadMain({
className={`config-section ${!isAuthenticated ? "disabled" : ""}`}
>
<div className="config-grid">
{/* Pipeline selector; rendered only when the `pipeline` feature flag is on. */}
{features.pipeline && (
  <ToggleGroup
    label={
      <Flex gap="2" align="center">
        <svg
          width="20px"
          height="20px"
          viewBox="0 0 16 16"
          xmlns="http://www.w3.org/2000/svg"
          fill="none"
        >
          {/* React expects camelCase SVG props (fillRule / clipRule); the
              hyphenated forms trigger an "Invalid DOM property" warning. */}
          <path
            fill="#FFF"
            fillRule="evenodd"
            d="M2.75 2.5A1.75 1.75 0 001 4.25v1C1 6.216 1.784 7 2.75 7h1a1.75 1.75 0 001.732-1.5H6.5a.75.75 0 01.75.75v3.5A2.25 2.25 0 009.5 12h1.018c.121.848.85 1.5 1.732 1.5h1A1.75 1.75 0 0015 11.75v-1A1.75 1.75 0 0013.25 9h-1a1.75 1.75 0 00-1.732 1.5H9.5a.75.75 0 01-.75-.75v-3.5A2.25 2.25 0 006.5 4H5.482A1.75 1.75 0 003.75 2.5h-1zM2.5 4.25A.25.25 0 012.75 4h1a.25.25 0 01.25.25v1a.25.25 0 01-.25.25h-1a.25.25 0 01-.25-.25v-1zm9.75 6.25a.25.25 0 00-.25.25v1c0 .138.112.25.25.25h1a.25.25 0 00.25-.25v-1a.25.25 0 00-.25-.25h-1z"
            clipRule="evenodd"
          />
        </svg>
        <span>Pipeline</span>
      </Flex>
    }
    // "Default" is a UI-only sentinel standing in for "no pipeline selected".
    value={config.pipeline || "Default"}
    onChange={(value) =>
      setConfig({
        ...config,
        // Map the sentinel back to `undefined`; any other option is a Pipeline
        // value. No inner flag check is needed: this element only exists when
        // `features.pipeline` is true.
        pipeline: (value === "Default"
          ? undefined
          : (value as Pipeline)) as WhenEnabled<"pipeline", Pipeline>,
      })
    }
    options={[
      { label: "Default", value: "Default" },
      { label: "Azure", value: Pipeline.Azure },
    ]}
  />
)}
<ToggleGroup
label={
<Flex gap="2" align="center">
Expand Down
37 changes: 37 additions & 0 deletions apps/web/src/config/env.config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/** Feature flags available to the web app; every flag is a plain boolean. */
export interface Features {
  /** Pipeline feature flag; `getEnvConfig` derives it from `VITE_FEATURE_FLAG_PIPELINE`. */
  pipeline: boolean;
  // Add new feature flags here
  // example: betaFeature: boolean;
}

/** Environment-derived configuration returned by `getEnvConfig`. */
export interface EnvConfig {
  /** Resolved feature flags. */
  features: Features;
}

/**
 * Builds the environment configuration from `import.meta.env`.
 *
 * A flag is enabled only when its variable is exactly the string "true";
 * any other value (including an unset variable) yields `false`.
 *
 * @returns A freshly built {@link EnvConfig} on every call.
 */
export const getEnvConfig = (): EnvConfig => {
  const pipelineEnabled =
    import.meta.env.VITE_FEATURE_FLAG_PIPELINE === "true";

  // Add new feature implementations here
  const features: Features = { pipeline: pipelineEnabled };

  return { features };
};

/**
 * Checks that each required feature-flag variable is set explicitly.
 *
 * For every required flag, the variable `VITE_FEATURE_FLAG_<FLAG>` (flag name
 * upper-cased) is read from `import.meta.env`; flags are checked in order and
 * the first invalid one aborts validation.
 *
 * @throws Error if a variable is anything other than the string "true" or "false".
 */
export function validateEnvConfig(): void {
  const requiredFlags: Array<keyof Features> = ["pipeline"];

  requiredFlags.forEach((flag) => {
    const envKey = `VITE_FEATURE_FLAG_${flag.toUpperCase()}`;
    const raw = import.meta.env[envKey];
    const isBooleanLiteral = raw === "true" || raw === "false";

    if (!isBooleanLiteral) {
      throw new Error(`${envKey} must be either "true" or "false"`);
    }
  });
}

/**
 * Type helper for feature-guarded values.
 *
 * Feature flags are resolved at runtime, so `Features[Flag]` is `boolean`,
 * never the literal `true`. The check therefore tests for `boolean`: a value
 * guarded by a boolean flag may be `T` (flag on) or `undefined` (flag off or
 * nothing selected). Testing against `true` would always take the false
 * branch and collapse the type to `undefined`, forcing `as` casts at every
 * assignment.
 *
 * @typeParam Flag - Name of the guarding feature flag.
 * @typeParam T - Type of the value when the feature is enabled.
 */
export type WhenEnabled<
  Flag extends keyof Features,
  T,
> = Features[Flag] extends boolean ? T | undefined : undefined;
14 changes: 12 additions & 2 deletions apps/web/src/models/taskConfig.model.ts
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,8 @@ export interface JsonSchema {
schema_type?: string;
}

import { WhenEnabled } from "../config/env.config";

export interface UploadFormData {
/** Optional chunk processing configuration */
chunk_processing?: ChunkProcessing;
Expand Down Expand Up @@ -178,6 +180,13 @@ export interface UploadFormData {
* @default 512
*/
target_chunk_length?: number;

/** Pipeline to run after processing */
pipeline?: WhenEnabled<"pipeline", Pipeline>;
}

/** Pipelines selectable for an upload; the string value is what the `pipeline` field carries. */
export enum Pipeline {
  Azure = "Azure",
}

const DEFAULT_SEGMENT_CONFIG: SegmentProcessingConfig = {
Expand All @@ -200,8 +209,8 @@ const DEFAULT_FORMULA_CONFIG: SegmentProcessingConfig = {

const DEFAULT_PICTURE_CONFIG: SegmentProcessingConfig = {
crop_image: CroppingStrategy.All,
html: GenerationStrategy.LLM,
markdown: GenerationStrategy.LLM,
html: GenerationStrategy.Auto,
markdown: GenerationStrategy.Auto,
};

export const DEFAULT_SEGMENT_PROCESSING: SegmentProcessing = {
Expand All @@ -227,4 +236,5 @@ export const DEFAULT_UPLOAD_CONFIG: UploadFormData = {
segment_processing: DEFAULT_SEGMENT_PROCESSING,
json_schema: undefined, // or some default schema if needed
file: new File([], ""),
pipeline: undefined as WhenEnabled<"pipeline", Pipeline>, // Default pipeline
};
5 changes: 5 additions & 0 deletions apps/web/src/models/upload.model.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ import {
OcrStrategy,
SegmentProcessing,
SegmentationStrategy,
Pipeline,
} from "./taskConfig.model";
import { WhenEnabled } from "../config/env.config";

export interface UploadForm {
/** The file to be uploaded */
Expand All @@ -30,4 +32,7 @@ export interface UploadForm {

/** Strategy for document segmentation */
segmentation_strategy?: SegmentationStrategy;

/** Pipeline to run after processing */
pipeline?: WhenEnabled<"pipeline", Pipeline>;
}
6 changes: 6 additions & 0 deletions clients/node-client/jest.load.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
module.exports = {
preset: "ts-jest",
testEnvironment: "node",
testMatch: ["**/__tests__/**/*.load.test.ts"],
testTimeout: 300000, // 5 minute timeout for load tests
};
3 changes: 3 additions & 0 deletions clients/python-client/src/chunkr_ai/api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@ class Model(str, Enum):
FAST = "Fast"
HIGH_QUALITY = "HighQuality"

class PipelineType(str, Enum):
    """Pipeline choices accepted by the ``pipeline`` field of ``Configuration``."""

    AZURE = "Azure"

class Configuration(BaseModel):
chunk_processing: Optional[ChunkProcessing] = Field(default=None)
Expand All @@ -139,6 +141,7 @@ class Configuration(BaseModel):
ocr_strategy: Optional[OcrStrategy] = Field(default=None)
segment_processing: Optional[SegmentProcessing] = Field(default=None)
segmentation_strategy: Optional[SegmentationStrategy] = Field(default=None)
pipeline: Optional[PipelineType] = Field(default=None)

@model_validator(mode="before")
def map_deprecated_fields(cls, values: Dict) -> Dict:
Expand Down
2 changes: 2 additions & 0 deletions clients/python-client/src/chunkr_ai/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
SegmentType,
SegmentationStrategy,
Status,
PipelineType,
)

from .api.task import TaskResponse
Expand Down Expand Up @@ -45,4 +46,5 @@
"Status",
"TaskResponse",
"TaskResponseAsync",
"PipelineType",
]
15 changes: 15 additions & 0 deletions clients/python-client/tests/test_chunkr.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
GenerationConfig,
JsonSchema,
OcrStrategy,
PipelineType,
Property,
SegmentationStrategy,
SegmentProcessing,
Expand Down Expand Up @@ -411,3 +412,17 @@ async def test_update_task_direct(chunkr_client, sample_path):
assert task.status == "Succeeded"
assert task.output is not None
assert task.configuration.segmentation_strategy == SegmentationStrategy.PAGE


@pytest.mark.asyncio
async def test_pipeline_type(chunkr_client, sample_path):
    """Upload with the Azure pipeline selected and expect a succeeded task with output."""
    client_type, client = chunkr_client
    azure_config = Configuration(pipeline=PipelineType.AZURE)

    # The fixture yields either a sync or an async client; only the latter is awaited.
    if client_type == "async":
        response = await client.upload(sample_path, azure_config)
    else:
        response = client.upload(sample_path, azure_config)

    assert response.task_id is not None
    assert response.status == "Succeeded"
    assert response.output is not None
6 changes: 3 additions & 3 deletions compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ services:
env_file:
- .env
deploy:
replicas: 1
replicas: 0
restart: always

task:
Expand All @@ -121,7 +121,7 @@ services:
env_file:
- .env
deploy:
replicas: 1
replicas: 0
restart: always

web:
Expand All @@ -132,7 +132,7 @@ services:
env_file:
- .env
deploy:
replicas: 1
replicas: 0
restart: always
segmentation:
image: luminainc/segmentation:df6e5375
Expand Down
4 changes: 3 additions & 1 deletion core/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,6 @@ cmd.txt

pdfium-binaries

.cargo
.cargo

azure-analysis-response.json
3 changes: 3 additions & 0 deletions core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ version = "0.0.0"
edition = "2021"
default-run = "core"

[features]
azure = []

[dependencies]
actix-cors = "0.7.0"
actix-multipart = "0.7.2"
Expand Down
2 changes: 2 additions & 0 deletions core/migrations/.gitkeep
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ALTER TABLE TASKS
ADD COLUMN version TEXT;
36 changes: 36 additions & 0 deletions core/src/configs/azure_config.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
use config::{Config as ConfigTrait, ConfigError};
use dotenvy::dotenv_override;
use serde::{Deserialize, Serialize};

/// Azure settings deserialized from the environment (see `Config::from_env`).
// NOTE(review): presumably targets Azure Document Intelligence — confirm against the caller.
#[derive(Debug, Serialize, Deserialize)]
pub struct Config {
    /// API version string; falls back to `default_api_version()` when not provided.
    #[serde(default = "default_api_version")]
    pub api_version: String,
    /// Service endpoint; has no default, so it must be provided.
    pub endpoint: String,
    /// Credential for the service (presumably the API key — confirm); has no default.
    pub key: String,
    /// Model identifier; falls back to `default_model_id()` when not provided.
    #[serde(default = "default_model_id")]
    pub model_id: String,
}

/// Serde fallback for `Config::api_version` when the value is not supplied.
fn default_api_version() -> String {
    String::from("2024-11-30")
}

/// Serde fallback for `Config::model_id` when the value is not supplied.
fn default_model_id() -> String {
    String::from("prebuilt-layout")
}

impl Config {
    /// Builds the configuration from environment variables.
    ///
    /// Calls `dotenv_override()` first and ignores its result, then reads
    /// variables using the `AZURE` prefix with `__` as the separator.
    ///
    /// # Errors
    /// Returns a `ConfigError` if the source cannot be built or the values
    /// cannot be deserialized into `Config`.
    pub fn from_env() -> Result<Self, ConfigError> {
        // Best effort: a failure to load a `.env` file is deliberately ignored.
        let _ = dotenv_override();

        let env_source = config::Environment::default()
            .prefix("AZURE")
            .separator("__");

        let settings = ConfigTrait::builder().add_source(env_source).build()?;
        settings.try_deserialize()
    }
}
3 changes: 3 additions & 0 deletions core/src/configs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ pub mod stripe_config;
pub mod throttle_config;
pub mod user_config;
pub mod worker_config;

#[cfg(feature = "azure")]
pub mod azure_config;
1 change: 0 additions & 1 deletion core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ pub mod pipeline;
pub mod routes;
pub mod utils;

use configs::worker_config;
use jobs::init::init_jobs;
use middleware::auth::AuthMiddlewareFactory;
use routes::github::get_github_repo_info;
Expand Down
Loading
Loading