Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions libs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,8 @@ The following types of information can be extracted:
- `TABLE`: data in tabular form found in the document
- `IMAGE`: image found in the document

For Confluence sources, provide the instance `url` and API `token`, and include at least one of `space_key` or `cql` (empty values are ignored, and extraction fails if neither is set). Optional flags such as `include_attachments`, `keep_markdown_format`, and `keep_newlines` mirror the parameters supported by LangChain's `ConfluenceLoader`.

For sitemap sources, additional parameters can be provided, e.g.:

- `web_path`: The URL of the XML sitemap to crawl
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import (
ConfluenceLangchainDocument2InformationPiece,
)
from langchain_community.document_loaders.confluence import ContentFormat


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -54,11 +56,28 @@ async def aextract_content(
A list of information pieces extracted from Confluence.
"""
# Convert list of key value pairs to dict
confluence_loader_parameters = {
x.key: int(x.value) if x.value.isdigit() else x.value for x in extraction_parameters.kwargs
}
if not confluence_loader_parameters.get("max_pages") or isinstance(
confluence_loader_parameters.get("max_pages"), str
confluence_loader_parameters = {}
for key_value in extraction_parameters.kwargs or []:
if key_value is None or key_value.key is None:
continue

value = key_value.value
if isinstance(value, str):
value = value.strip()
if not value and key_value.key in {"space_key", "cql"}:
# Skip empty optional parameters
continue
if value.isdigit():
value = int(value)

confluence_loader_parameters[key_value.key] = value

if "cql" not in confluence_loader_parameters and "space_key" not in confluence_loader_parameters:
raise ValueError("Either 'space_key' or 'cql' must be provided for Confluence extraction.")
if (
"max_pages" in confluence_loader_parameters
and not confluence_loader_parameters.get("max_pages")
or isinstance(confluence_loader_parameters.get("max_pages"), str)
):
logging.warning(
"max_pages parameter is not set or invalid discarding it. ConfluenceLoader will use default value."
Expand All @@ -67,6 +86,7 @@ async def aextract_content(
# Drop the document_name parameter as it is not used by the ConfluenceLoader
if "document_name" in confluence_loader_parameters:
confluence_loader_parameters.pop("document_name", None)
confluence_loader_parameters["content_format"] = ContentFormat.VIEW
document_loader = ConfluenceLoader(**confluence_loader_parameters)
documents = document_loader.load()
return [self._mapper.map_document2informationpiece(x, extraction_parameters.document_name) for x in documents]
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,11 @@ class ConfluenceParameters(BaseModel):

url: StrictStr = Field(description="url of the confluence space.")
token: StrictStr = Field(description="api key to access confluence.")
space_key: StrictStr = Field(description="the space key of the confluence pages.")
space_key: Optional[StrictStr] = Field(default=None, description="the space key of the confluence pages.")
cql: Optional[StrictStr] = Field(
default=None,
description="Optional Confluence Query Language (CQL) expression used to filter pages.",
)
include_attachments: Optional[StrictBool] = Field(
default=False,
description="whether to include file attachments (e.g., images, documents) in the parsed content. Default is `false`.",
Expand All @@ -55,6 +59,7 @@ class ConfluenceParameters(BaseModel):
"url",
"token",
"space_key",
"cql",
"include_attachments",
"keep_markdown_format",
"keep_newlines",
Expand Down Expand Up @@ -120,6 +125,7 @@ def from_dict(cls, obj: Dict) -> Self:
"url": obj.get("url"),
"token": obj.get("token"),
"space_key": obj.get("space_key"),
"cql": obj.get("cql"),
"include_attachments": (
obj.get("include_attachments") if obj.get("include_attachments") is not None else False
),
Expand Down
68 changes: 68 additions & 0 deletions libs/extractor-api-lib/tests/confluence_extractor_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""Tests for the ConfluenceExtractor."""

import pytest
from unittest.mock import MagicMock, patch
from langchain_core.documents import Document as LangchainDocument

from extractor_api_lib.impl.extractors.confluence_extractor import ConfluenceExtractor
from extractor_api_lib.models.extraction_parameters import ExtractionParameters
from extractor_api_lib.models.key_value_pair import KeyValuePair
from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece
from extractor_api_lib.impl.types.content_type import ContentType


@pytest.fixture
def confluence_mapper():
    """Provide a mocked mapper whose mapping call always yields the same text piece."""
    # Build the canned result first so the mock wiring below stays a one-liner.
    canned_piece = InternalInformationPiece(
        type=ContentType.TEXT,
        metadata={"document": "doc", "id": "id", "related": []},
        page_content="content",
    )
    mapper_mock = MagicMock()
    mapper_mock.map_document2informationpiece.return_value = canned_piece
    return mapper_mock


@pytest.mark.asyncio
@patch("extractor_api_lib.impl.extractors.confluence_extractor.ConfluenceLoader")
async def test_aextract_content_supports_cql(mock_loader_cls, confluence_mapper):
    """Check that a CQL-only request reaches the loader without a space key."""
    # Arrange: the patched loader class hands back a stub that loads one document.
    canned_documents = [LangchainDocument(page_content="content", metadata={"title": "Doc"})]
    loader_stub = MagicMock()
    loader_stub.load.return_value = canned_documents
    mock_loader_cls.return_value = loader_stub

    request_pairs = [
        KeyValuePair(key="url", value="https://example.atlassian.net"),
        KeyValuePair(key="token", value="token"),
        KeyValuePair(key="cql", value="type=page"),
    ]
    params = ExtractionParameters(
        document_name="confluence_doc",
        source_type="confluence",
        kwargs=request_pairs,
    )

    # Act
    pieces = await ConfluenceExtractor(mapper=confluence_mapper).aextract_content(params)

    # Assert: one piece mapped, the CQL forwarded, and no space key passed along.
    assert len(pieces) == 1
    confluence_mapper.map_document2informationpiece.assert_called_once()
    forwarded = mock_loader_cls.call_args.kwargs
    assert forwarded["cql"] == "type=page"
    assert "space_key" not in forwarded


@pytest.mark.asyncio
async def test_aextract_content_requires_space_key_or_cql(confluence_mapper):
    """Verify that extraction fails when neither a space key nor CQL is supplied.

    Only ``url`` and ``token`` are passed, so ``aextract_content`` is expected to
    raise ``ValueError`` carrying the exact "either/or" message.
    """
    import re  # local import: only needed to escape the literal message below

    extractor = ConfluenceExtractor(mapper=confluence_mapper)
    extraction_parameters = ExtractionParameters(
        document_name="confluence_doc",
        source_type="confluence",
        kwargs=[
            KeyValuePair(key="url", value="https://example.atlassian.net"),
            KeyValuePair(key="token", value="token"),
        ],
    )

    expected_message = "Either 'space_key' or 'cql' must be provided for Confluence extraction."
    # ``match`` is a regular expression applied with ``re.search``; escaping keeps
    # the trailing "." (and any future metacharacters) a literal comparison.
    with pytest.raises(ValueError, match=re.escape(expected_message)):
        await extractor.aextract_content(extraction_parameters)
20 changes: 15 additions & 5 deletions services/frontend/libs/admin-app/data-access/document.api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ axios.defaults.auth = {

// confluence configuration interface
export interface ConfluenceConfig {
spaceKey: string;
spaceKey?: string;
cql?: string;
token: string;
url: string;
maxPages?: number;
Expand Down Expand Up @@ -55,11 +56,20 @@ export class DocumentAPI {
static async loadConfluence(config: ConfluenceConfig): Promise<void> {
try {
// convert config to list of key/value items for backend
const payload = [
{ key: 'url', value: config.url },
const payload: { key: string; value: string }[] = [
{ key: 'url', value: config.url.trim() },
{ key: 'token', value: config.token },
{ key: 'space_key', value: config.spaceKey },
] as { key: string; value: string }[];
];

const spaceKey = config.spaceKey?.trim();
if (spaceKey) {
payload.push({ key: 'space_key', value: spaceKey });
}

const cql = config.cql?.trim();
if (cql) {
payload.push({ key: 'cql', value: cql });
}

if (typeof config.maxPages === 'number') {
payload.push({ key: 'max_pages', value: String(config.maxPages) });
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ const spaceKey = ref('');
const confluenceToken = ref('');
const confluenceUrl = ref('');
const maxPages = ref<number>();
const confluenceCql = ref('');

// sitemap configuration refs
const sitemapName = ref('');
Expand Down Expand Up @@ -75,7 +76,8 @@ const handleConfluenceUpload = () => {
spaceKey: spaceKey.value,
token: confluenceToken.value,
url: confluenceUrl.value,
maxPages: maxPages.value
maxPages: maxPages.value,
cql: confluenceCql.value,
});
}

Expand Down Expand Up @@ -182,13 +184,16 @@ const getErrorMessage = (errorType: string) => {
<label for="confluenceName" class="sr-only"> Confluence Name</label>
<input v-model="confluenceName" type="text" placeholder="Name" class="input input-bordered w-full" />
<label for="spaceKey" class="sr-only">Space key</label>
<input v-model="spaceKey" type="text" placeholder="Space key" class="input input-bordered w-full" />
<input v-model="spaceKey" type="text" placeholder="Space key (optional)" class="input input-bordered w-full" />
<label for="confluenceCql" class="sr-only">CQL</label>
<input v-model="confluenceCql" type="text" placeholder="CQL query (optional)" class="input input-bordered w-full" />
<label for="confluenceToken" class="sr-only">Token</label>
<input v-model="confluenceToken" type="password" placeholder="Token" class="input input-bordered w-full" />
<label for="maxPages" class="sr-only">Max pages</label>
<input v-model.number="maxPages" type="number" placeholder="Max number of pages" class="input input-bordered w-full" />
<input v-model.number="maxPages" type="number" placeholder="Max number of pages (optional)" class="input input-bordered w-full" />
</div>
<p class="text-xs opacity-50 mb-4">{{ t('documents.confluenceLoadDescription') }}</p>
<p class="text-xs opacity-50">{{ t('documents.confluenceLoadDescription') }}</p>
<p class="text-xs opacity-50 mb-4">{{ t('documents.confluenceQueryHint') }}</p>
<button class="btn btn-sm btn-accent" @click="handleConfluenceUpload">
{{ t('documents.loadConfluence') }}
</button>
Expand Down
5 changes: 3 additions & 2 deletions services/frontend/libs/i18n/admin/de.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
"uploadingDocument": "Wird hochgeladen...",
"fileUpload": "Datei-Upload",
"confluenceUpload": "Confluence",
"confluenceLoadTitle": "Confluence-Seiten laden",
"confluenceLoadDescription": "Klicken Sie auf den Button unten, um Seiten aus Confluence zu laden",
"confluenceLoadTitle": "Confluence-Inhalte laden",
"confluenceLoadDescription": "Geben Sie Ihre Confluence-Zugangsdaten an und wählen Sie einen Space-Key oder einen CQL-Filter",
"confluenceQueryHint": "Lassen Sie die Felder leer, um den gesamten Bereich zu laden, oder geben Sie einen Confluence Query Language (CQL) Ausdruck zum Filtern der Seiten ein",
"loadConfluence": "Laden starten",
"fileTypeNotAllowedTitle": "Dateityp nicht erlaubt",
"fileTypeNotAllowedDescription": "Nur PDF-, DOCX-, PPTX- und XML-Dateien sind erlaubt",
Expand Down
5 changes: 3 additions & 2 deletions services/frontend/libs/i18n/admin/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
"fileTypeNotAllowedDescription": "Only PDF, DOCX, PPTX, and XML files are allowed",
"fileUpload": "File Upload",
"confluenceUpload": "Confluence",
"confluenceLoadTitle": "Load all Confluence pages from a space",
"confluenceLoadDescription": "Click the button below to load pages from Confluence",
"confluenceLoadTitle": "Load Confluence content",
"confluenceLoadDescription": "Provide your Confluence credentials and choose a space key or CQL filter",
"confluenceQueryHint": "Leave fields blank to load the whole space or supply a Confluence Query Language (CQL) expression to filter pages",
"loadConfluence": "Load Confluence",
"sitemapUpload": "Sitemap",
"sitemapLoadTitle": "Load content from a sitemap",
Expand Down
Loading