1
2 package fr.paris.lutece.plugins.knowledge.service;
3
import dev.langchain4j.data.document.*;
import dev.langchain4j.data.document.parser.*;
import dev.langchain4j.data.document.splitter.*;
import dev.langchain4j.data.embedding.*;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.embedding.*;
import dev.langchain4j.model.openai.*;
import dev.langchain4j.model.output.Response;
import dev.langchain4j.store.embedding.*;
import dev.langchain4j.store.embedding.elasticsearch.ElasticsearchEmbeddingStore;
import fr.paris.lutece.plugins.knowledge.business.Dataset;
import fr.paris.lutece.plugins.knowledge.business.DatasetFile;
import fr.paris.lutece.portal.service.file.FileServiceException;
import fr.paris.lutece.portal.service.file.IFileStoreServiceProvider;
import fr.paris.lutece.portal.service.util.AppLogService;

import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import static dev.langchain4j.model.openai.OpenAiModelName.*;
import static java.time.Duration.*;
24
25 public class ElasticStoreService
26 {
27 private static IFileStoreServiceProvider fileStoreService = DataSetService.getFileStoreServiceProvider( );
28 private static final Map<Integer, EmbeddingStore<TextSegment>> embeddingStores = new HashMap<>( );
29 private static final EmbeddingModel embeddingModel = OpenAiEmbeddingModel.builder( ).apiKey( Constant.API_KEY ).modelName( TEXT_EMBEDDING_ADA_002 )
30 .timeout( ofSeconds( 600 ) ).logRequests( true ).logResponses( true ).build( );
31
32
33
34
35
36
37
38
39
40 public static void store( DatasetFile dataSetFile, Dataset dataSet )
41 {
42 String fileKey = dataSetFile.getFileKey( );
43 EmbeddingStore<TextSegment> projectEmbeddingStore;
44
45 projectEmbeddingStore = embeddingStores.computeIfAbsent( dataSet.getId( ), key -> {
46 return getElasticsearchEmbeddingStore( dataSet.getId( ) );
47 } );
48
49 try {
50
51 InputStream file = fileStoreService.getInputStream( fileKey );
52 Document document4j = parseDocument( file, dataSetFile.getName( ) );
53
54
55 List<TextSegment> segments = new DocumentByLineSplitter( dataSet.getRecordMaxTokens( ), 5 ).split( document4j );
56
57 Response<List<Embedding>> embeddings = embeddingModel.embedAll( segments );
58
59
60 projectEmbeddingStore.addAll( embeddings.content( ), segments );
61 } catch (FileServiceException e) {
62 AppLogService.error(e);
63 }
64
65 }
66
67
68
69
70
71
72
73
74 private static Document parseDocument( InputStream inputStream, String fileName )
75 {
76 String extension = fileName.substring( fileName.lastIndexOf( "." ), fileName.length( ) );
77
78 return extensionSwitcher( extension, inputStream );
79 }
80
81
82
83
84
85
86
87
88
89
90 private static Document extensionSwitcher( String extension, InputStream inputStream )
91 {
92 switch( extension )
93 {
94 case ".pdf":
95 return new PdfDocumentParser( ).parse( inputStream );
96 case ".docx":
97 return new MsOfficeDocumentParser( DocumentType.DOC ).parse( inputStream );
98 case ".pptx":
99 return new MsOfficeDocumentParser( DocumentType.PPT ).parse( inputStream );
100 case ".xlsx":
101 return new MsOfficeDocumentParser( DocumentType.XLS ).parse( inputStream );
102 default:
103 return new TextDocumentParser( DocumentType.TXT ).parse( inputStream );
104 }
105 }
106
107
108
109
110
111
112
113
114 private static ElasticsearchEmbeddingStore getElasticsearchEmbeddingStore( int dateSetId )
115 {
116
117 return ElasticsearchEmbeddingStore.builder( ).serverUrl( Constant.ELASTIC_URL ).userName( Constant.ELASTIC_USERNAME ).password( Constant.ELASTIC_PASSWORD )
118 .indexName( "luteceai-embeddings-" + dateSetId ).build( );
119 };
120
121
122
123
124
125
126
127
128 public static EmbeddingStore<TextSegment> getEmbeddingStore( int dateSetId )
129 {
130 if ( !embeddingStores.containsKey( dateSetId ) )
131 {
132 embeddingStores.put( dateSetId, getElasticsearchEmbeddingStore( dateSetId ) );
133 }
134
135 return embeddingStores.get( dateSetId );
136 }
137
138
139
140
141
142
143 public static EmbeddingModel getEmbeddingModel( )
144 {
145 return embeddingModel;
146 }
147
148 }