6060import com .oracle .graal .python .runtime .sequence .storage .SequenceStorage ;
6161import com .oracle .truffle .api .CompilerDirectives ;
6262import com .oracle .truffle .api .CompilerDirectives .TruffleBoundary ;
63+ import com .oracle .truffle .api .TruffleFile ;
6364import com .oracle .truffle .api .dsl .Cached ;
6465import com .oracle .truffle .api .dsl .Fallback ;
6566import com .oracle .truffle .api .dsl .GenerateNodeFactory ;
7071import java .io .File ;
7172import java .io .IOException ;
7273import java .io .InputStream ;
73- import java .util .Enumeration ;
74+ import java .nio .file .StandardOpenOption ;
75+ import java .util .ArrayList ;
7476import java .util .List ;
7577import java .util .zip .ZipEntry ;
76- import java .util .zip .ZipFile ;
78+ import java .util .zip .ZipInputStream ;
7779
7880@ CoreFunctions (extendClasses = PythonBuiltinClassType .PZipImporter )
7981public class ZipImporterBuiltins extends PythonBuiltins {
8082
8183 private static final String INIT_WAS_NOT_CALLED = "zipimporter.__init__() wasn't called" ;
8284
85+ /**
86+ * This stream is need to find locations of zip entries in the zip file. The main purpose of
87+ * this is to find location of the first local file header in the zipfile, which doesn't have to
88+ * be as ZipInputStream expects. Some of zip files (like .egg files) don't start with location
89+ * signature `PK\003\004` but with a code, that should be executed.
90+ *
91+ * In such case ZipInptuStream doesn't work, it just expects that the stream starts with the
92+ * location signature.
93+ *
94+ * This stream also improve performance of unzipping files in ZipImporter case. A content of
95+ * file is obtained from the zip, when it's needed (imported). The locations of zip entry
96+ * positions are cached in the zip directory cache. When content of a file is needed, then
97+ * previous zip entries are skipped and ZipInputStream is created from the required position.
98+ *
99+ * New ZipInputStream from this stream can be created after calling findFirstEntryPostion.
100+ *
101+ * It locates all occurrences of LOC signatures, even if a signature is a part of a content of a
102+ * file. This situation has to be handled separately.
103+ */
104+ private static class LOCZipEntryStream extends InputStream {
105+ // states of the simple lexer
106+ private static final byte AFTER_P = 1 ;
107+ private static final byte AFTER_PK = 2 ;
108+ private static final byte AFTER_PK3 = 3 ;
109+ private static final byte BEFORE_P = 0 ;
110+
111+ private byte state = BEFORE_P ; // the default state
112+ private static final byte [] LOC_SIG = new byte []{80 , 75 , 3 , 4 }; // zip location signature
113+
114+ private final InputStream in ;
115+ long pos = 0 ; // position in the input stream
116+ private boolean readFirstLoc ; // is the first location detected?
117+ List <Long > positions ; // store the locations
118+
119+ public LOCZipEntryStream (InputStream in ) {
120+ this .readFirstLoc = false ;
121+ this .positions = new ArrayList <>();
122+ this .in = in ;
123+ }
124+
125+ @ Override
126+ public int read () throws IOException {
127+ if (readFirstLoc ) {
128+ // This expect that the bytes of the first LOC was consumed by this stream
129+ // (due to calling findFirstEntryPosition) and now the stream
130+ // has to push back the LOC bytes
131+ int index = (int ) (pos - positions .get (0 ));
132+ if (index < LOC_SIG .length ) {
133+ pos ++;
134+ return LOC_SIG [index ];
135+ }
136+ readFirstLoc = false ; // never do it again
137+ }
138+ int ch = in .read ();
139+ pos ++;
140+ switch (state ) {
141+ case BEFORE_P :
142+ if (ch == LOC_SIG [0 ]) {
143+ state = AFTER_P ;
144+ }
145+ break ;
146+ case AFTER_P :
147+ if (ch == LOC_SIG [1 ]) {
148+ state = AFTER_PK ;
149+ } else {
150+ state = BEFORE_P ;
151+ }
152+ break ;
153+ case AFTER_PK :
154+ if (ch == LOC_SIG [2 ]) {
155+ state = AFTER_PK3 ;
156+ } else {
157+ state = BEFORE_P ;
158+ }
159+ break ;
160+ case AFTER_PK3 :
161+ if (ch == LOC_SIG [3 ]) {
162+ positions .add (pos - 4 ); // store the LOC position
163+ }
164+ state = BEFORE_P ;
165+ }
166+ return ch ;
167+ }
168+
169+ void findFirstEntryPosition () throws IOException {
170+ while (positions .isEmpty () && read () != -1 ) {
171+ // do nothing here, just read until the first LOC is found
172+ }
173+ if (!positions .isEmpty ()) {
174+ pos -= 4 ;
175+ readFirstLoc = true ;
176+ }
177+ }
178+ }
179+
83180 @ Override
84181 protected List <? extends NodeFactory <? extends PythonBuiltinBaseNode >> getNodeFactories () {
85182 return ZipImporterBuiltinsFactory .getFactories ();
@@ -99,65 +196,107 @@ private void initZipImporter(PZipImporter self, String path) {
99196 throw raise (PythonErrorType .ZipImportError , "archive path is empty" );
100197 }
101198
102- File file = new File (path );
199+ TruffleFile tfile = getContext (). getEnv (). getTruffleFile (path );
103200 String prefix = "" ;
104201 String archive = "" ;
105202 while (true ) {
106- File fullPathFile = new File (file .getPath ());
107- try {
108- if (fullPathFile .isFile ()) {
109- archive = file .getPath ();
110- break ;
111- }
112- } catch (SecurityException se ) {
113- // continue
203+ if (tfile .isRegularFile ()) {
204+ // we don't have to store absolute path
205+ archive = tfile .getPath ();
206+ break ;
114207 }
115-
116- // back up one path element
117- File parentFile = file .getParentFile ();
208+ TruffleFile parentFile = tfile .getParent ();
118209 if (parentFile == null ) {
119210 break ;
120211 }
121-
122- prefix = file .getName () + PZipImporter .SEPARATOR + prefix ;
123- file = parentFile ;
212+ prefix = tfile .getName () + PZipImporter .SEPARATOR + prefix ;
213+ tfile = parentFile ;
124214 }
125- ZipFile zipFile = null ;
126215
127- if (file .exists () && file .isFile ()) {
128- try {
129- zipFile = new ZipFile (file );
130- } catch (IOException e ) {
131- throw raise (PythonErrorType .ZipImportError , "not a Zip file" );
132- }
133- }
134- if (zipFile == null ) {
135- throw raise (PythonErrorType .ZipImportError , "not a Zip file" );
136- }
137- Object files = self .getZipDirectoryCache ().getItem (path );
138- if (files == null ) {
139- PDict filesDict = factory ().createDict ();
140- Enumeration <? extends ZipEntry > entries = zipFile .entries ();
141- while (entries .hasMoreElements ()) {
142- ZipEntry entry = entries .nextElement ();
143- PTuple tuple = factory ().createTuple (new Object []{
144- zipFile .getName () + PZipImporter .SEPARATOR + entry .getName (),
145- // for our implementation currently we don't need these
146- // these properties to store there. Keeping them for
147- // compatibility.
148- entry .getMethod (),
149- entry .getCompressedSize (),
150- entry .getSize (),
151- entry .getLastModifiedTime ().toMillis (),
152- entry .getCrc ()});
153- filesDict .setItem (entry .getName (), tuple );
216+ if (tfile .exists () && tfile .isRegularFile ()) {
217+ Object files = self .getZipDirectoryCache ().getItem (path );
218+ if (files == null ) {
219+ // fill the cache
220+ PDict filesDict = factory ().createDict ();
221+ ZipInputStream zis = null ;
222+ LOCZipEntryStream locis = null ;
223+ try {
224+ locis = new LOCZipEntryStream (tfile .newInputStream (StandardOpenOption .READ ));
225+ locis .findFirstEntryPosition (); // find location of the first zip entry
226+ if (locis .positions .isEmpty ()) {
227+ // no PK\003\004 found -> not a correct zip file
228+ throw raise (PythonErrorType .ZipImportError , "not a Zip file: '%s'" , archive );
229+ }
230+ zis = new ZipInputStream (locis ); // and create new ZipInput stream from this
231+ // location
232+ ZipEntry entry ;
233+
234+ // help variable to handle case when there LOC is in content of a file
235+ long lastZipEntryCSize = 0 ;
236+ long lastZipEntryPos = 0 ;
237+ int lastZipLocFileHeaderSize = 0 ;
238+ long zipEntryPos ;
239+
240+ byte [] extraField ;
241+ while ((entry = zis .getNextEntry ()) != null ) {
242+ zipEntryPos = locis .positions .remove (0 );
243+ // handles situation when the local file signature is
244+ // in the content of a file
245+ while (lastZipEntryPos + lastZipEntryCSize + lastZipLocFileHeaderSize > zipEntryPos ) {
246+ zipEntryPos = locis .positions .remove (0 );
247+ }
248+
249+ PTuple tuple = factory ().createTuple (new Object []{
250+ tfile .getPath () + PZipImporter .SEPARATOR + entry .getName (),
251+ // for our implementation currently we don't need these
252+ // these properties to store there. Keeping them for
253+ // compatibility.
254+ entry .getMethod (),
255+ lastZipEntryCSize = entry .getCompressedSize (),
256+ entry .getSize (),
257+ entry .getLastModifiedTime ().toMillis (),
258+ entry .getCrc (),
259+ // store the entry position for faster reading content
260+ lastZipEntryPos = zipEntryPos
261+ });
262+ filesDict .setItem (entry .getName (), tuple );
263+ // count local file header from the last zipentry
264+ lastZipLocFileHeaderSize = 30 + entry .getName ().length ();
265+ extraField = entry .getExtra ();
266+ if (extraField != null ) {
267+ lastZipLocFileHeaderSize += extraField .length ;
268+ }
269+ }
270+ } catch (IOException ex ) {
271+ throw raise (PythonErrorType .ZipImportError , "not a Zip file: '%s'" , archive );
272+ } finally {
273+ if (zis != null ) {
274+ try {
275+ zis .close ();
276+ } catch (IOException e ) {
277+ // just ignore it.
278+ }
279+ } else {
280+ if (locis != null ) {
281+ try {
282+ locis .close ();
283+ } catch (IOException e ) {
284+ // just ignore it.
285+ }
286+ }
287+ }
288+ }
289+ files = filesDict ;
290+ self .getZipDirectoryCache ().setItem (path , files );
154291 }
155- files = filesDict ;
156- self .getZipDirectoryCache ().setItem (path , files );
292+ self .setArchive (archive );
293+ self .setPrefix (prefix );
294+ self .setFiles ((PDict ) files );
295+
296+ } else {
297+ throw raise (PythonErrorType .ZipImportError , "not a Zip file: '%s'" , archive );
157298 }
158- self .setArchive (archive );
159- self .setPrefix (prefix );
160- self .setFiles ((PDict ) files );
299+
161300 }
162301
163302 @ Specialization
@@ -324,28 +463,35 @@ public PBytes doit(PZipImporter self, String pathname) {
324463 if (fileSize < 0 ) {
325464 throw raise (PythonErrorType .ZipImportError , "negative data size" );
326465 }
327- ZipFile zip = null ;
466+ long streamPosition = (long ) tocEntry .getArray ()[6 ];
467+ ZipInputStream zis = null ;
468+ TruffleFile tfile = getContext ().getEnv ().getTruffleFile (archive );
328469 try {
329- zip = new ZipFile (archive );
330- ZipEntry entry = zip .getEntry (key );
331- InputStream in = zip .getInputStream (entry );
470+ InputStream in = tfile .newInputStream (StandardOpenOption .READ );
471+ in .skip (streamPosition ); // we can fast skip bytes, because there is cached position
472+ // of the zip entry
473+ zis = new ZipInputStream (in );
474+ ZipEntry entry = zis .getNextEntry ();
475+ if (entry == null || !entry .getName ().equals (key )) {
476+ throw raise (PythonErrorType .ZipImportError , "zipimport: wrong cached file position" );
477+ }
332478 int byteSize = (int ) fileSize ;
333479 if (byteSize != fileSize ) {
334480 throw raise (PythonErrorType .ZipImportError , "zipimport: cannot read archive members large than 2GB" );
335481 }
336482 byte [] bytes = new byte [byteSize ];
337483 int bytesRead = 0 ;
338484 while (bytesRead < byteSize ) {
339- bytesRead += in .read (bytes , bytesRead , byteSize - bytesRead );
485+ bytesRead += zis .read (bytes , bytesRead , byteSize - bytesRead );
340486 }
341- in .close ();
487+ zis .close ();
342488 return factory ().createBytes (bytes );
343489 } catch (IOException e ) {
344490 throw raise (PythonErrorType .ZipImportError , "zipimport: can't read data" );
345491 } finally {
346- if (zip != null ) {
492+ if (zis != null ) {
347493 try {
348- zip .close ();
494+ zis .close ();
349495 } catch (IOException e ) {
350496 // just ignore it.
351497 }
0 commit comments