---
# Main configuration file (main.yaml) of an atextcrawler instance.
#
# Settings that are commented out (e.g. `#site_delay`) are optional;
# uncomment them to override the documented default value.

# Name of this instance
# Default value: atextcrawler
# Allowed values: arbitrary string
instance_name: atextcrawler

# Which kind of instance is this?
# Default value: prod
# Allowed values are:
# - 'dev': development instance
# - 'staging': staging instance
# - 'prod': production instance
instance_type: prod

# Log level
# Default value: info
# Allowed values: critical, error, warning, info, debug
log_level: info

# Plugins directory
# If given as relative path, it will be relative to the
# directory of this file (main.yaml).
# Read documentation on plugins.
# Default value: plugins
# Hint: Create an empty __init__.py in the plugins_dir.
plugins_dir: plugins

# Parameters for access to the PostgreSQL service
# No default values; must be set.
postgresql:
  host: localhost
  port: 5432
  database: atextcrawler
  user: atextcrawler
  # Placeholder; replace with the real password. Kept quoted so that a
  # password containing YAML special characters stays a plain string.
  password: "________________________"

# Crawling
crawl:
  # Number of concurrent workers
  # Default value: 10
  # Allowed values: integer >=0 and <=1000
  workers: 10

  # Delay in seconds between attempts to fetch items
  # from site_queue if the last attempt gave no item
  # Also the delay in seconds after a worker has found
  # no site to process
  # Default value: 600
  # Allowed values: positive number
  #site_delay: 10

  # Time interval in seconds between site updates when
  # handling queued base URLs
  # Default value: 3600
  # Allowed values: positive number
  #site_revisit_interval: 3600

  # Delay in seconds between attempts to process
  # individual resources (pages etc.) of a site
  # Default value: 5
  # Allowed values: positive number
  #resource_delay: 3

  # Default interval in seconds between full crawls of a site
  # Default value: 864000 (10 days)
  # Allowed values: positive number
  #full_crawl_interval: 864000

  # Default interval in seconds between feed crawls of a site
  # Default value: 86400 (1 day)
  # Allowed values: positive number
  #feed_crawl_interval: 86400

  # Minimum length of the text (in characters) extracted from
  # a resource; resources with shorter texts are not stored.
  # Default value: 300
  # Allowed values: positive number
  #min_text_length: 300

# Parameters for access to the ElasticSearch service
# No default values; must be set.
elasticsearch:
  # host on which ES is running
  host: localhost
  # API key for accessing ES
  api_key: "____________________"
  # API user id
  id: "____________________"
  # Index base name (full index names will have '_text_{language}' appended)
  index_base_name: atext

# Tensorflow access
tensorflow:
  # The prediction endpoint of the model server's sentence model
  model_server_endpoint: http://localhost:9000/v1/models/sentences:predict