View Javadoc
1   
2   package fr.paris.lutece.plugins.knowledge.service;
3   
4   import dev.langchain4j.data.document.*;
5   import dev.langchain4j.data.document.parser.*;
6   import dev.langchain4j.data.document.splitter.*;
7   import dev.langchain4j.data.embedding.*;
8   import dev.langchain4j.data.segment.TextSegment;
9   import dev.langchain4j.model.embedding.*;
10  import dev.langchain4j.model.openai.*;
11  import dev.langchain4j.model.output.Response;
12  import dev.langchain4j.store.embedding.*;
13  import dev.langchain4j.store.embedding.elasticsearch.ElasticsearchEmbeddingStore;
14  import fr.paris.lutece.plugins.knowledge.business.Dataset;
15  import fr.paris.lutece.plugins.knowledge.business.DatasetFile;
16  import fr.paris.lutece.portal.service.file.FileServiceException;
17  import fr.paris.lutece.portal.service.file.IFileStoreServiceProvider;
18  import fr.paris.lutece.portal.service.util.AppLogService;
19  
20  import java.io.InputStream;
21  import java.util.*;
22  import static dev.langchain4j.model.openai.OpenAiModelName.*;
23  import static java.time.Duration.*;
24  
25  public class ElasticStoreService
26  {
27      private static IFileStoreServiceProvider fileStoreService = DataSetService.getFileStoreServiceProvider( );
28      private static final Map<Integer, EmbeddingStore<TextSegment>> embeddingStores = new HashMap<>( );
29      private static final EmbeddingModel embeddingModel = OpenAiEmbeddingModel.builder( ).apiKey( Constant.API_KEY ).modelName( TEXT_EMBEDDING_ADA_002 )
30              .timeout( ofSeconds( 600 ) ).logRequests( true ).logResponses( true ).build( );
31  
32      /**
33       * Stores a file.
34       * 
35       * @param document
36       *            The document to store.
37       * @param projectId
38       *            The ID of the project.
39       */
40      public static void store( DatasetFile dataSetFile, Dataset dataSet )
41      {
42          String fileKey = dataSetFile.getFileKey( );
43          EmbeddingStore<TextSegment> projectEmbeddingStore;
44  
45          projectEmbeddingStore = embeddingStores.computeIfAbsent( dataSet.getId( ), key -> {
46              return getElasticsearchEmbeddingStore( dataSet.getId( ) );
47          } );
48  
49          try {
50              // get file
51  			InputStream file = fileStoreService.getInputStream( fileKey );
52  			Document document4j = parseDocument( file, dataSetFile.getName( ) );
53  
54  			// Generate embeddings
55  			List<TextSegment> segments = new DocumentByLineSplitter( dataSet.getRecordMaxTokens( ), 5 ).split( document4j );
56  
57  			Response<List<Embedding>> embeddings = embeddingModel.embedAll( segments );
58  
59  			// Store embeddings
60  			projectEmbeddingStore.addAll( embeddings.content( ), segments );
61  		} catch (FileServiceException e) {
62  			AppLogService.error(e);
63  		}
64  
65      }
66  
67      /**
68       * Gets the embedding sources for the relevant embeddings.
69       * 
70       * @param relevantEmbeddings
71       *            The relevant embeddings.
72       * @return The list of embedding sources.
73       */
74      private static Document parseDocument( InputStream inputStream, String fileName )
75      {
76          String extension = fileName.substring( fileName.lastIndexOf( "." ), fileName.length( ) );
77  
78          return extensionSwitcher( extension, inputStream );
79      }
80  
81      /**
82       * Switches on the extension of the file.
83       * 
84       * @param extension
85       *            The extension of the file.
86       * @param inputStream
87       *            The input stream of the file.
88       * @return The parsed document.
89       */
90      private static Document extensionSwitcher( String extension, InputStream inputStream )
91      {
92          switch( extension )
93          {
94              case ".pdf":
95                  return new PdfDocumentParser( ).parse( inputStream );
96              case ".docx":
97                  return new MsOfficeDocumentParser( DocumentType.DOC ).parse( inputStream );
98              case ".pptx":
99                  return new MsOfficeDocumentParser( DocumentType.PPT ).parse( inputStream );
100             case ".xlsx":
101                 return new MsOfficeDocumentParser( DocumentType.XLS ).parse( inputStream );
102             default:
103                 return new TextDocumentParser( DocumentType.TXT ).parse( inputStream );
104         }
105     }
106 
107     /**
108      * Gets the embedding store for the given project ID.
109      * 
110      * @param projectId
111      *            The ID of the project.
112      * @return The embedding store.
113      */
114     private static ElasticsearchEmbeddingStore getElasticsearchEmbeddingStore( int dateSetId )
115     {
116         // null basicauth credentials are managed by the builder
117         return ElasticsearchEmbeddingStore.builder( ).serverUrl( Constant.ELASTIC_URL ).userName( Constant.ELASTIC_USERNAME ).password( Constant.ELASTIC_PASSWORD )
118                 .indexName( "luteceai-embeddings-" + dateSetId ).build( );
119     };
120 
121     /**
122      * Gets the embedding store by project ID.
123      * 
124      * @param projectId
125      *            The ID of the project.
126      * @return The embedding store.
127      */
128     public static EmbeddingStore<TextSegment> getEmbeddingStore( int dateSetId )
129     {
130         if ( !embeddingStores.containsKey( dateSetId ) )
131         {
132             embeddingStores.put( dateSetId, getElasticsearchEmbeddingStore( dateSetId ) );
133         }
134 
135         return embeddingStores.get( dateSetId );
136     }
137 
138     /**
139      * Gets the embedding model.
140      * 
141      * @return The embedding model.
142      */
143     public static EmbeddingModel getEmbeddingModel( )
144     {
145         return embeddingModel;
146     }
147 
148 }