@@ -26,6 +26,9 @@ const TIMEOUT = 10000;
26
26
27
27
/**
 * Input describing a single page-rendering task.
 */
export interface taskParams {
  /** Target page; its `href` is also used as the URL that forwarded cookies are attached to. */
  url: URL;
  /**
   * Headers to forward with the task, keyed by header name.
   * They are merged into the headers of navigation requests; a `cookie` entry
   * is additionally parsed and set on the page as individual cookies.
   */
  headersToForward: {
    [s: string]: string;
  };
}
30
33
31
34
export interface taskResult {
@@ -264,22 +267,33 @@ class Renderer {
264
267
await extensionsPage . close ( ) ;
265
268
}
266
269
267
- private async _createNewPage ( ) {
268
- if ( this . _stopping ) {
269
- throw new Error ( "Called _createNewPage on a stopping Renderer" ) ;
270
- }
271
-
272
- const browser = await this . _getBrowser ( ) ;
273
- const context = await browser . createIncognitoBrowserContext ( ) ;
274
- const page = await context . newPage ( ) ;
275
-
276
- await page . setUserAgent ( "Algolia Crawler Renderscript" ) ;
277
- await page . setCacheEnabled ( false ) ;
278
- await page . setViewport ( { width : WIDTH , height : HEIGHT } ) ;
270
+ private async _defineRequestContextForPage ( {
271
+ page,
272
+ task
273
+ } : {
274
+ page : puppeteer . Page ,
275
+ task : taskParams
276
+ } ) {
277
+ const { url, headersToForward } = task ;
279
278
280
- /* Ignore useless resources */
281
279
await page . setRequestInterception ( true ) ;
282
- page . on ( "request" , async req => {
280
+ if ( headersToForward . cookie ) {
281
+ const cookies = headersToForward . cookie . split ( '; ' ) . map ( c => {
282
+ const [ key , ...v ] = c . split ( '=' ) ;
283
+ // url attribute is required because it is not possible to set cookies on a blank page
284
+ // so page.setCookie would crash if no url is provided, since we start with a blank page
285
+ return { url : url . href , name : key , value : v . join ( '=' ) } ;
286
+ } ) ;
287
+ try {
288
+ await page . setCookie ( ...cookies )
289
+ }
290
+ catch ( e ) {
291
+ console . error ( 'failed to set cookie on page' , url ) ;
292
+ }
293
+ }
294
+
295
+ /* Ignore useless/dangerous resources */
296
+ page . on ( 'request' , async ( req : puppeteer . Request ) => {
283
297
// check for ssrf attempts
284
298
try {
285
299
await validateURL ( {
@@ -311,12 +325,34 @@ class Renderer {
311
325
return ;
312
326
}
313
327
// console.log(req.resourceType(), req.url());
328
+ if ( req . isNavigationRequest ( ) ) {
329
+ const headers = req . headers ( ) ;
330
+ await req . continue ( {
331
+ // a `Cookie` value passed in headers is ignored here; cookies rely on page.setCookie instead
332
+ headers : { ...headers , ...headersToForward }
333
+ } ) ;
334
+ return ;
335
+ }
314
336
await req . continue ( ) ;
315
337
} catch ( e ) {
316
338
if ( ! e . message . match ( / R e q u e s t i s a l r e a d y h a n d l e d / ) ) throw e ;
317
339
// Ignore Request is already handled error
318
340
}
319
341
} ) ;
342
+ }
343
+
344
/**
 * Opens a page inside a brand-new incognito browser context and applies the
 * renderer defaults: fixed user agent, cache disabled, WIDTH x HEIGHT viewport.
 *
 * If the page cannot be created or configured, the incognito context is
 * closed before the error is rethrown so the context is not leaked.
 *
 * @returns The configured page together with the incognito context it was created in.
 * @throws Error when called while the renderer is stopping.
 */
private async _createNewPage() {
  if (this._stopping) {
    throw new Error("Called _createNewPage on a stopping Renderer");
  }

  const browser = await this._getBrowser();
  const context = await browser.createIncognitoBrowserContext();

  try {
    const page = await context.newPage();

    await page.setUserAgent("Algolia Crawler Renderscript");
    await page.setCacheEnabled(false);
    await page.setViewport({ width: WIDTH, height: HEIGHT });

    return { page, context };
  } catch (e) {
    // Best-effort cleanup: a failure while closing must not mask the original error.
    await context.close().catch(() => undefined);
    throw e;
  }
}
@@ -326,10 +362,13 @@ class Renderer {
326
362
return await this . _pageBuffer . shift ( ) ! ;
327
363
}
328
364
329
- private async _processPage ( { url } : taskParams , taskId : string ) {
365
+ private async _processPage ( task : taskParams , taskId : string ) {
330
366
/* Setup */
367
+ const { url } = task ;
331
368
const { context, page } = await this . _newPage ( ) ;
332
369
370
+ await this . _defineRequestContextForPage ( { page, task } ) ;
371
+
333
372
let response : puppeteer . Response | null = null ;
334
373
let timeout = false ;
335
374
page . addListener ( "response" , ( r : puppeteer . Response ) => {
0 commit comments