diff --git a/libs/README.md b/libs/README.md index cf8efffb..ec608ee0 100644 --- a/libs/README.md +++ b/libs/README.md @@ -321,6 +321,8 @@ The following types of information can be extracted: - `TABLE`: data in tabular form found in the document - `IMAGE`: image found in the document +For Confluence sources, provide the instance `url` and API `token` and include either a `space_key` or a `cql` filter (empty values are ignored). Optional flags such as `include_attachments`, `keep_markdown_format`, and `keep_newlines` mirror the parameters supported by LangChain's `ConfluenceLoader`. + For sitemap sources, additional parameters can be provided, e.g.: - `web_path`: The URL of the XML sitemap to crawl diff --git a/libs/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py b/libs/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py index 0c025062..176dbf60 100644 --- a/libs/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py +++ b/libs/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py @@ -10,6 +10,8 @@ from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import ( ConfluenceLangchainDocument2InformationPiece, ) +from langchain_community.document_loaders.confluence import ContentFormat + logger = logging.getLogger(__name__) @@ -54,11 +56,28 @@ async def aextract_content( A list of information pieces extracted from Confluence. 
""" # Convert list of key value pairs to dict - confluence_loader_parameters = { - x.key: int(x.value) if x.value.isdigit() else x.value for x in extraction_parameters.kwargs - } - if not confluence_loader_parameters.get("max_pages") or isinstance( - confluence_loader_parameters.get("max_pages"), str + confluence_loader_parameters = {} + for key_value in extraction_parameters.kwargs or []: + if key_value is None or key_value.key is None: + continue + + value = key_value.value + if isinstance(value, str): + value = value.strip() + if not value and key_value.key in {"space_key", "cql"}: + # Skip empty optional parameters + continue + if value.isdigit(): + value = int(value) + + confluence_loader_parameters[key_value.key] = value + + if "cql" not in confluence_loader_parameters and "space_key" not in confluence_loader_parameters: + raise ValueError("Either 'space_key' or 'cql' must be provided for Confluence extraction.") + if ( + "max_pages" in confluence_loader_parameters + and not confluence_loader_parameters.get("max_pages") + or isinstance(confluence_loader_parameters.get("max_pages"), str) ): logging.warning( "max_pages parameter is not set or invalid discarding it. ConfluenceLoader will use default value." 
@@ -67,6 +86,7 @@ async def aextract_content( # Drop the document_name parameter as it is not used by the ConfluenceLoader if "document_name" in confluence_loader_parameters: confluence_loader_parameters.pop("document_name", None) + confluence_loader_parameters["content_format"] = ContentFormat.VIEW document_loader = ConfluenceLoader(**confluence_loader_parameters) documents = document_loader.load() return [self._mapper.map_document2informationpiece(x, extraction_parameters.document_name) for x in documents] diff --git a/libs/extractor-api-lib/src/extractor_api_lib/models/confluence_parameters.py b/libs/extractor-api-lib/src/extractor_api_lib/models/confluence_parameters.py index 2c6b6043..66ad4290 100644 --- a/libs/extractor-api-lib/src/extractor_api_lib/models/confluence_parameters.py +++ b/libs/extractor-api-lib/src/extractor_api_lib/models/confluence_parameters.py @@ -33,7 +33,11 @@ class ConfluenceParameters(BaseModel): url: StrictStr = Field(description="url of the confluence space.") token: StrictStr = Field(description="api key to access confluence.") - space_key: StrictStr = Field(description="the space key of the confluence pages.") + space_key: Optional[StrictStr] = Field(default=None, description="the space key of the confluence pages.") + cql: Optional[StrictStr] = Field( + default=None, + description="Optional Confluence Query Language (CQL) expression used to filter pages.", + ) include_attachments: Optional[StrictBool] = Field( default=False, description="whether to include file attachments (e.g., images, documents) in the parsed content. 
Default is `false`.", @@ -55,6 +59,7 @@ class ConfluenceParameters(BaseModel): "url", "token", "space_key", + "cql", "include_attachments", "keep_markdown_format", "keep_newlines", @@ -120,6 +125,7 @@ def from_dict(cls, obj: Dict) -> Self: "url": obj.get("url"), "token": obj.get("token"), "space_key": obj.get("space_key"), + "cql": obj.get("cql"), "include_attachments": ( obj.get("include_attachments") if obj.get("include_attachments") is not None else False ), diff --git a/libs/extractor-api-lib/tests/confluence_extractor_test.py b/libs/extractor-api-lib/tests/confluence_extractor_test.py new file mode 100644 index 00000000..9aa5def7 --- /dev/null +++ b/libs/extractor-api-lib/tests/confluence_extractor_test.py @@ -0,0 +1,68 @@ +"""Tests for the ConfluenceExtractor.""" + +import pytest +from unittest.mock import MagicMock, patch +from langchain_core.documents import Document as LangchainDocument + +from extractor_api_lib.impl.extractors.confluence_extractor import ConfluenceExtractor +from extractor_api_lib.models.extraction_parameters import ExtractionParameters +from extractor_api_lib.models.key_value_pair import KeyValuePair +from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece +from extractor_api_lib.impl.types.content_type import ContentType + + +@pytest.fixture +def confluence_mapper(): + """Return a mapper mock that produces predictable information pieces.""" + mapper = MagicMock() + mapper.map_document2informationpiece.return_value = InternalInformationPiece( + type=ContentType.TEXT, + metadata={"document": "doc", "id": "id", "related": []}, + page_content="content", + ) + return mapper + + +@pytest.mark.asyncio +@patch("extractor_api_lib.impl.extractors.confluence_extractor.ConfluenceLoader") +async def test_aextract_content_supports_cql(mock_loader_cls, confluence_mapper): + """Ensure the extractor forwards the CQL parameter to the loader.""" + extractor = ConfluenceExtractor(mapper=confluence_mapper) + 
extraction_parameters = ExtractionParameters( + document_name="confluence_doc", + source_type="confluence", + kwargs=[ + KeyValuePair(key="url", value="https://example.atlassian.net"), + KeyValuePair(key="token", value="token"), + KeyValuePair(key="cql", value="type=page"), + ], + ) + + mock_loader_instance = MagicMock() + mock_loader_instance.load.return_value = [LangchainDocument(page_content="content", metadata={"title": "Doc"})] + mock_loader_cls.return_value = mock_loader_instance + + results = await extractor.aextract_content(extraction_parameters) + + assert len(results) == 1 + confluence_mapper.map_document2informationpiece.assert_called_once() + loader_kwargs = mock_loader_cls.call_args.kwargs + assert loader_kwargs["cql"] == "type=page" + assert "space_key" not in loader_kwargs + + +@pytest.mark.asyncio +async def test_aextract_content_requires_space_key_or_cql(confluence_mapper): + """The extractor must receive either a space key or a CQL expression.""" + extractor = ConfluenceExtractor(mapper=confluence_mapper) + extraction_parameters = ExtractionParameters( + document_name="confluence_doc", + source_type="confluence", + kwargs=[ + KeyValuePair(key="url", value="https://example.atlassian.net"), + KeyValuePair(key="token", value="token"), + ], + ) + + with pytest.raises(ValueError, match="Either 'space_key' or 'cql' must be provided for Confluence extraction."): + await extractor.aextract_content(extraction_parameters) diff --git a/services/frontend/libs/admin-app/data-access/document.api.ts b/services/frontend/libs/admin-app/data-access/document.api.ts index f226a5fd..6efc6cc0 100644 --- a/services/frontend/libs/admin-app/data-access/document.api.ts +++ b/services/frontend/libs/admin-app/data-access/document.api.ts @@ -9,7 +9,8 @@ axios.defaults.auth = { // confluence configuration interface export interface ConfluenceConfig { - spaceKey: string; + spaceKey?: string; + cql?: string; token: string; url: string; maxPages?: number; @@ -55,11 +56,20 @@ 
export class DocumentAPI { static async loadConfluence(config: ConfluenceConfig): Promise { try { // convert config to list of key/value items for backend - const payload = [ - { key: 'url', value: config.url }, + const payload: { key: string; value: string }[] = [ + { key: 'url', value: config.url.trim() }, { key: 'token', value: config.token }, - { key: 'space_key', value: config.spaceKey }, - ] as { key: string; value: string }[]; + ]; + + const spaceKey = config.spaceKey?.trim(); + if (spaceKey) { + payload.push({ key: 'space_key', value: spaceKey }); + } + + const cql = config.cql?.trim(); + if (cql) { + payload.push({ key: 'cql', value: cql }); + } if (typeof config.maxPages === 'number') { payload.push({ key: 'max_pages', value: String(config.maxPages) }); diff --git a/services/frontend/libs/admin-app/feature-document/DocumentUploadContainer.vue b/services/frontend/libs/admin-app/feature-document/DocumentUploadContainer.vue index 87db96bf..5dd33aa2 100644 --- a/services/frontend/libs/admin-app/feature-document/DocumentUploadContainer.vue +++ b/services/frontend/libs/admin-app/feature-document/DocumentUploadContainer.vue @@ -22,6 +22,7 @@ const spaceKey = ref(''); const confluenceToken = ref(''); const confluenceUrl = ref(''); const maxPages = ref(); +const confluenceCql = ref(''); // sitemap configuration refs const sitemapName = ref(''); @@ -75,7 +76,8 @@ const handleConfluenceUpload = () => { spaceKey: spaceKey.value, token: confluenceToken.value, url: confluenceUrl.value, - maxPages: maxPages.value + maxPages: maxPages.value, + cql: confluenceCql.value, }); } @@ -182,13 +184,16 @@ const getErrorMessage = (errorType: string) => { - + + + - + -

{{ t('documents.confluenceLoadDescription') }}

+

{{ t('documents.confluenceLoadDescription') }}

+

{{ t('documents.confluenceQueryHint') }}

diff --git a/services/frontend/libs/i18n/admin/de.json b/services/frontend/libs/i18n/admin/de.json index 97709297..b4179ca6 100644 --- a/services/frontend/libs/i18n/admin/de.json +++ b/services/frontend/libs/i18n/admin/de.json @@ -10,8 +10,9 @@ "uploadingDocument": "Wird hochgeladen...", "fileUpload": "Datei-Upload", "confluenceUpload": "Confluence", - "confluenceLoadTitle": "Confluence-Seiten laden", - "confluenceLoadDescription": "Klicken Sie auf den Button unten, um Seiten aus Confluence zu laden", + "confluenceLoadTitle": "Confluence-Inhalte laden", + "confluenceLoadDescription": "Geben Sie Ihre Confluence-Zugangsdaten an und wählen Sie einen Space-Key oder einen CQL-Filter", + "confluenceQueryHint": "Geben Sie einen Space-Key an, um den gesamten Bereich zu laden, oder einen Confluence Query Language (CQL) Ausdruck, um die Seiten zu filtern (mindestens eine der beiden Angaben ist erforderlich)", "loadConfluence": "Laden starten", "fileTypeNotAllowedTitle": "Dateityp nicht erlaubt", "fileTypeNotAllowedDescription": "Nur PDF-, DOCX-, PPTX- und XML-Dateien sind erlaubt", diff --git a/services/frontend/libs/i18n/admin/en.json b/services/frontend/libs/i18n/admin/en.json index b1d145fe..f1f038fa 100644 --- a/services/frontend/libs/i18n/admin/en.json +++ b/services/frontend/libs/i18n/admin/en.json @@ -12,8 +12,9 @@ "fileTypeNotAllowedDescription": "Only PDF, DOCX, PPTX, and XML files are allowed", "fileUpload": "File Upload", "confluenceUpload": "Confluence", - "confluenceLoadTitle": "Load all Confluence pages from a space", - "confluenceLoadDescription": "Click the button below to load pages from Confluence", + "confluenceLoadTitle": "Load Confluence content", + "confluenceLoadDescription": "Provide your Confluence credentials and choose a space key or CQL filter", + "confluenceQueryHint": "Enter a space key to load the whole space, or supply a Confluence Query Language (CQL) expression to filter pages (at least one of the two is required)", "loadConfluence": "Load Confluence", "sitemapUpload": "Sitemap", "sitemapLoadTitle": "Load content from a 
sitemap",